@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/examples/llava/clip.h

@@ -1,6 +1,7 @@
 #ifndef CLIP_H
 #define CLIP_H
 
+#include "ggml.h"
 #include <stddef.h>
 #include <stdint.h>
 
@@ -29,19 +30,13 @@ struct clip_image_size {
     int height;
 };
 
-struct
-
-
-};
-
-struct clip_image_f32_batch {
-    struct clip_image_f32 * data;
-    size_t size;
-};
+struct clip_image_f32;
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
 
 struct clip_context_params {
     bool use_gpu;
-
+    enum ggml_log_level verbosity;
 };
 
 // deprecated, use clip_init
@@ -52,11 +47,11 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
 
-CLIP_API int32_t
-CLIP_API int32_t
-CLIP_API int32_t
+CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 
 // TODO: should be enum, not string
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
@@ -64,23 +59,45 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches
-
-CLIP_API int
+GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
+    "use clip_n_output_tokens instead");
+GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
+    "use clip_n_output_tokens instead");
+
+CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
 
-CLIP_API struct clip_image_size
-CLIP_API struct clip_image_u8
-CLIP_API struct clip_image_f32
+CLIP_API struct clip_image_size * clip_image_size_init();
+CLIP_API struct clip_image_u8 * clip_image_u8_init ();
+CLIP_API struct clip_image_f32 * clip_image_f32_init();
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
+
+// nx, ny are the output image dimensions
+CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
 
+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
+// use for accessing underlay data of clip_image_f32_batch
+CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
 /**
  * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
  * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
@@ -105,8 +122,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
-
-CLIP_API
+CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
 
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
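The clip.h changes above make clip_image_f32_batch an opaque type with accessor functions and deprecate clip_n_patches/clip_n_patches_by_img in favour of the per-image clip_n_output_tokens. A minimal migration sketch using only the declarations shown in this diff (the helper name count_output_tokens and its surrounding setup are illustrative, not part of the package):

    #include "clip.h"

    // Count the total number of output tokens for a preprocessed batch.
    // Assumes ctx was loaded with clip_init() and batch was filled by clip_image_preprocess().
    static int count_output_tokens(struct clip_ctx * ctx, const struct clip_image_f32_batch * batch) {
        int n_tokens = 0;
        const size_t n_imgs = clip_image_f32_batch_n_images(batch);               // was: batch->size
        for (size_t i = 0; i < n_imgs; i++) {
            struct clip_image_f32 * img = clip_image_f32_get_img(batch, (int) i); // was: &batch->data[i]
            n_tokens += clip_n_output_tokens(ctx, img);                           // was: clip_n_patches(ctx)
        }
        return n_tokens;
    }
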
package/src/llama.cpp/examples/llava/deprecation-warning.cpp (new file)

@@ -0,0 +1,22 @@
+#include <cstdio>
+#include <string>
+
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    size_t pos = filename.find_last_of("/\\");
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}

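The new deprecation-warning.cpp strips the directory from argv[0] and prints a fixed warning before returning EXIT_FAILURE. For a binary invoked as ./llama-llava-cli (name illustrative), the output would be:

    WARNING: The binary 'llama-llava-cli' is deprecated.
    Please use 'llama-mtmd-cli' instead.
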
package/src/llama.cpp/examples/llava/llava.cpp

@@ -10,6 +10,7 @@
 #include <cstring>
 #include <limits>
 #include <vector>
+#include <memory>
 
 #if defined(LLAVA_LOG_OFF)
 # define LOG_INF(...)
@@ -45,6 +46,17 @@ struct clip_image_grid_shape {
     int second;
 };
 
+// convenience cpp wrapper
+struct clip_image_f32_batch_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
+
+struct clip_image_size_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
+
 /**
  * Selects the best resolution from a list of possible resolutions based on the original size.
  *
@@ -100,13 +112,13 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }
 
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;
 
-    const int32_t image_size =
-    const int32_t patch_size =
+    const int32_t image_size = clip_get_image_size(ctx_clip);
+    const int32_t patch_size = clip_get_patch_size(ctx_clip);
 
     int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
 
@@ -163,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip),
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -202,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out +
-    *n_img_pos_out = static_cast<int>(result->ne[1]+
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
 
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-
-    img_res_v.
-    img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
+    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
         LOG_ERR("%s: unable to preprocess image\n", __func__);
-        delete[] img_res_v.data;
         return false;
     }
 
@@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
+    const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
+
     if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
         std::vector<float *> image_embd_v;
-        image_embd_v.resize(
-
+        image_embd_v.resize(n_imgs);
+        clip_image_size load_image_size;
 
-        for (size_t i = 0; i <
+        for (size_t i = 0; i < n_imgs; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
-
-            int
-
-
-
+            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
+            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
+            int patch_size = 14;
+            load_image_size.width = nx;
+            load_image_size.height = ny;
+            clip_add_load_image_size(ctx_clip, &load_image_size);
 
            bool encoded = false;
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            if (clip_is_qwen2vl(ctx_clip)) {
-                encoded = clip_image_encode(ctx_clip, n_threads,
+                encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
            }
            else {
-                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(
+                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
            }
 
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int)
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
                return false;
            }
            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
+            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            std::memcpy(
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
-                clip_embd_nbytes_by_img(ctx_clip,
-            n_img_pos_out +=
+                clip_embd_nbytes_by_img(ctx_clip, nx, ny));
+            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();
-        load_image_size
-        load_image_size
-        clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
+        load_image_size.width = img->nx;
+        load_image_size.height = img->ny;
+        clip_add_load_image_size(ctx_clip, &load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
    }
    else if (clip_is_glm(ctx_clip)){
        struct clip_image_size * load_image_size = clip_image_size_init();
-        load_image_size->width
-        load_image_size->height = img_res_v.
+        load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
+        load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
        clip_add_load_image_size(ctx_clip, load_image_size);
 
-
-
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
+        int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
        *n_img_pos = (pos * pos + 2);
        if (!encoded){
            LOG_ERR("Unable to encode image \n");
@@ -327,9 +342,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
-        *
-
-
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
        if (!encoded) {
            LOG_ERR("Unable to encode image\n");
 
@@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // spatial_unpad llava-1.6 type embedding
        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
        std::vector<float *> image_embd_v;
-        image_embd_v.resize(
-        for (size_t i = 0; i <
+        image_embd_v.resize(n_imgs);
+        for (size_t i = 0; i < n_imgs; i++) {
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-            const bool encoded = clip_image_encode(ctx_clip, n_threads,
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int)
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
        const int32_t * image_grid = clip_image_grid(ctx_clip);
        const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
@@ -360,17 +376,13 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
        }
 
-
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
-
-        const int32_t image_size = clip_image_size(ctx_clip);
+        const int32_t image_size = clip_get_image_size(ctx_clip);
 
        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
 
        int n_img_pos_out;
-
+        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
        *n_img_pos = n_img_pos_out;
 
        for (size_t i = 0; i < image_embd_v.size(); i++) {
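
The llava.cpp changes above wrap the now-opaque clip_image_f32_batch in a std::unique_ptr with a custom deleter (clip_image_f32_batch_ptr), so clip_image_f32_batch_free runs on every exit path and the manual delete[] img_res_v.data cleanup disappears. A self-contained sketch of the same pattern with a placeholder resource (thing, make_thing, and free_thing stand in for the clip types and are not from the package):

    #include <cstdio>
    #include <cstdlib>
    #include <memory>

    // Placeholder C-style resource, standing in for the opaque clip_image_f32_batch.
    struct thing { int payload; };
    static thing * make_thing()          { return (thing *) std::calloc(1, sizeof(thing)); }
    static void    free_thing(thing * t) { std::free(t); }

    // Same shape as clip_image_f32_batch_ptr: unique_ptr + custom deleter,
    // so the C free function is called automatically on every return path.
    struct thing_deleter {
        void operator()(thing * t) { free_thing(t); }
    };
    using thing_ptr = std::unique_ptr<thing, thing_deleter>;

    int main() {
        thing_ptr p(make_thing());
        if (!p) {
            return 1;                    // early exit: nothing leaks
        }
        std::printf("payload = %d\n", p->payload);
        return 0;                        // free_thing(p.get()) runs here as well
    }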