@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
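Many of the example-level changes in this release track llama.cpp's move to the new `common_*` helpers (the new `common/arg.cpp` and `common/log.cpp`, and the reworked `common/sampling.cpp` listed above); the hunks below call them directly. The following is only an illustrative sketch stitched together from calls that appear verbatim in the hunks below — it is not part of llama.node's API, and the `LLAMA_EXAMPLE_COMMON` tag (from `common/arg.h`) is assumed; the llava example below passes `LLAMA_EXAMPLE_LLAVA` instead.

```cpp
// Hedged sketch of the new common_* flow used by the updated examples.
#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"
#include "llama.h"

#include <vector>

static void print_usage(int /*argc*/, char ** argv) {
    LOG("\nusage: %s -m model.gguf [-p \"prompt\"]\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;
    // new argument parser (common/arg.cpp); the example enum selects the arg set
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }
    common_init(); // new logging setup (common/log.cpp)

    llama_backend_init();
    llama_model   * model = llama_load_model_from_file(params.model.c_str(), common_model_params_to_llama(params));
    llama_context * ctx   = llama_new_context_with_model(model, common_context_params_to_llama(params));

    // feed the prompt so there are logits to sample from
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, /*add_bos=*/true, /*parse_special=*/true);
    llama_decode(ctx, llama_batch_get_one(tokens.data(), (int) tokens.size()));

    // one sampler object now owns the whole sampling chain
    common_sampler * smpl = common_sampler_init(model, params.sparams);
    const llama_token id  = common_sampler_sample(smpl, ctx, -1);
    common_sampler_accept(smpl, id, /*accept_grammar=*/true);
    LOG_INF("next token: %s\n", common_token_to_piece(ctx, id).c_str());

    common_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```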
package/src/llama.cpp/examples/llava/llava.cpp

@@ -1,13 +1,23 @@
 #include "clip.h"
-#include "common.h"
-#include "llama.h"
 #include "llava.h"
-#include "base64.hpp"
 
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <vector>
-
+
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        //
+        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -202,6 +212,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }
 
+static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+    int width = image->nx;
+    int height = image->ny;
+    int num_patches = (height / patch_size) * (width / patch_size);
+    clip_image_f32 * patch = clip_image_f32_init();
+    patch->nx = patch_size * num_patches;
+    patch->ny = patch_size;
+    patch->buf.resize(3 * patch->nx * patch->ny);
+
+    int patch_index = 0;
+
+    for (int i = 0; i < height; i += patch_size) {
+        for (int j = 0; j < width; j += patch_size) {
+            for (int pi = 0; pi < patch_size; ++pi) {
+                for (int pj = 0; pj < patch_size; ++pj) {
+                    int input_index = ((i + pi) * width + (j + pj)) * 3;
+                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+                    patch->buf[output_index] = image->buf[input_index];
+                    patch->buf[output_index+1] = image->buf[input_index+1];
+                    patch->buf[output_index+2] = image->buf[input_index+2];
+                }
+            }
+            patch_index++;
+        }
+    }
+    return patch;
+}
 
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@@ -209,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     img_res_v.size = 0;
     img_res_v.data = nullptr;
     if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
     }
@@ -218,17 +255,62 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    if (
+    if (clip_is_minicpmv(ctx_clip)) {
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            const int64_t t_img_enc_step_start_us = ggml_time_us();
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);
+            bool encoded = false;
+            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+            if (has_minicpmv_projector == 2) {
+                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            }
+            else if (has_minicpmv_projector == 3) {
+                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+            }
+            if (!encoded) {
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        int n_img_pos_out = 0;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+            n_img_pos_out += clip_n_patches(ctx_clip);
+        }
+        *n_img_pos = n_img_pos_out;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+    }
+    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
         delete[] img_res_v.data;
         if (!encoded) {
-
+            LOG_ERR("Unable to encode image\n");
 
             return false;
         }
-    }
+    }
+    else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
@@ -237,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
             if (!encoded) {
-
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
             }
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
         const int32_t * image_grid = clip_image_grid(ctx_clip);
 
@@ -275,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
     }
 
-
+    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
 
     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
 
-
+    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
 
     return true;
 }
@@ -290,22 +372,26 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
-
+        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
         return false;
     }
     return true;
 }
 
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-
+    int num_max_patches = 6;
+    if (clip_is_minicpmv(ctx_clip)) {
+        num_max_patches = 10;
+    }
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     if (!image_embd) {
-
+        LOG_ERR("Unable to allocate memory for image embeddings\n");
         return false;
     }
 
     int n_img_pos;
     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-
+        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
@@ -315,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }
 
+struct llava_embd_batch {
+    std::vector<llama_pos> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id> seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t> logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens   =*/ nullptr,
+            /*embd     =*/ embd,
+            /*pos      =*/ pos.data(),
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id   =*/ seq_ids.data(),
+            /*logits   =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id [i] = seq_id_0.data();
+            batch.logits [i] = false;
+        }
+    }
+};
+
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
 
@@ -323,9 +442,10 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-
-
-
+        float * embd = image_embed->embed+i*n_embd;
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, llava_batch.batch)) {
+            LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
         *n_past += n_eval;
@@ -337,7 +457,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
-
+        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
     }
 
@@ -346,7 +466,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-
+        LOG_ERR("%s: couldn't embed the image\n", __func__);
        return NULL;
    }
 
@@ -360,7 +480,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
     auto file = fopen(path, "rb");
     if (file == NULL) {
-
+        LOG_ERR("%s: can't read file %s\n", __func__, path);
         return false;
     }
 
@@ -370,7 +490,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
 
     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
     if (buffer == NULL) {
-
+        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
         perror("Memory allocation error");
         fclose(file);
         return false;
@@ -395,7 +515,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
     long image_bytes_length;
     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
     if (!loaded) {
-
+        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }
 
package/src/llama.cpp/examples/llava/llava.h

@@ -17,12 +17,11 @@
 # define LLAVA_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct clip_ctx;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 
 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
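For orientation, the `llava.h` declarations in the hunk above are typically combined as in the hedged sketch below; the contexts are assumed to be created elsewhere (for example as in the `minicpmv-cli.cpp` file that follows), and the thread count and error handling are placeholders.

```cpp
// Hedged sketch of the llava.h image-embedding API shown above.
#include "llava.h"
#include "llama.h"

static bool eval_one_image(struct llama_context * ctx_llama, struct clip_ctx * ctx_clip,
                           const char * image_path, int n_batch, int * n_past) {
    // build the image embedding from a file on disk
    struct llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, /*n_threads=*/4, image_path);
    if (embed == NULL) {
        return false;
    }

    // write the embedding into the llama context; *n_past ends up past the image tokens
    const bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);

    llava_image_embed_free(embed);
    return ok;
}
```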
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp (new file)

@@ -0,0 +1,323 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <iostream> // TODO: remove me
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llama_model * llava_init(common_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = common_model_params_to_llama(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
+    if (params->n_ctx < 2048) {
+        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
+        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        ctx_params.n_ctx = 2048;
+    } else {
+        ctx_params.n_ctx = params->n_ctx;
+    }
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+static struct clip_ctx * clip_init_context(common_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
+    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+
+    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    slice_embed->embed = image_embed;
+    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
+    llava_image_embed_free(slice_embed);
+}
+
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
+    std::string system_prompt;
+    int idx = 0;
+    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (has_minicpmv_projector == 2) {
+        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    }
+    else if (has_minicpmv_projector == 3) {
+        system_prompt = "<|im_start|>user\n";
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
+    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    if (num_image_embeds > 1) {
+        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+            for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                if (j == num_image_embeds_col - 1) {
+                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                }
+            }
+        }
+        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+}
+
+static const char * sample(struct common_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
+    static std::string ret;
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+        ret = "</s>";
+    } else {
+        ret = common_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    if (!embeds) {
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        return NULL;
+    }
+
+    // process the prompt
+    if (params->prompt.empty() && params->interactive == false) {
+        LOG_ERR("prompt should be given or interactive mode should be on");
+        return NULL;
+    }
+
+    auto * model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+    const int64_t t_llava_init_start_us = ggml_time_us();
+    auto * ctx_llava = llava_init_context(params, model);
+    ctx_llava->ctx_clip = ctx_clip;
+    const int64_t t_llava_init_end_us = ggml_time_us();
+    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+
+    const int64_t t_process_image_start_us = ggml_time_us();
+    process_image(ctx_llava, embeds, params, n_past);
+    const int64_t t_process_image_end_us = ggml_time_us();
+    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+
+    llava_image_embed_free(embeds);
+    return ctx_llava;
+}
+
+static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+    std::string user_prompt = prompt;
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (!is_first) {
+        if (has_minicpmv_projector == 2) {
+            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 3) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    if (has_minicpmv_projector == 2) {
+        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 3) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+
+    // generate the response
+
+    LOG_INF("\n");
+
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    return smpl;
+}
+
+static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
+
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+    return tmp;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.mmproj.empty() || (params.image.empty())) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    for (auto & image : params.image) {
+        int n_past = 0;
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+
+        if (!params.prompt.empty()) {
+            LOG("<user>%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+            std::string response;
+            bool have_tmp = false;
+            for (int i = 0; i < max_tgt_len; i++) {
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                response += tmp;
+                if (strcmp(tmp, "</s>") == 0){
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
+                }
+                if (strstr(tmp, "###")) break; // Yi-VL behavior
+                have_tmp = true;
+                printf("%s", tmp);
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+                fflush(stdout);
+            }
+            common_sampler_free(smpl);
+        }else {
+            while (true) {
+                LOG("<user>");
+                std::string prompt;
+                std::getline(std::cin, prompt);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                std::string response;
+                for (int i = 0; i < max_tgt_len; i++) {
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    response += tmp;
+                    if (strcmp(tmp, "</s>") == 0) break;
+                    if (strstr(tmp, "###")) break; // Yi-VL behavior
+                    printf("%s", tmp);// mistral llava-1.6
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    fflush(stdout);
+                }
+                common_sampler_free(smpl);
+            }
+        }
+        printf("\n");
+        llama_perf_context_print(ctx_llava->ctx_llama);
+
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+
+    return 0;
+}