@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
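The per-file hunks reproduced below cover the llava example sources (clip.h, llava-cli.cpp, llava.cpp, llava.h). For packages installed from npm, a comparable view can be generated locally with a recent npm, for example: npm diff --diff=@fugood/llama.node@0.3.0 --diff=@fugood/llama.node@0.3.2.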
package/src/llama.cpp/examples/llava/clip.h

@@ -18,14 +18,17 @@
 #    define CLIP_API
 #endif

-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif

 struct clip_ctx;

+struct clip_image_size {
+    int width;
+    int height;
+};
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

+CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+
+CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8 * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();

@@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons

 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

+CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
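The new clip.h entry points above expose image-size tracking and a MiniCPM-V probe to callers. A minimal sketch of how a caller might combine them before encoding (the helper name and the early-return policy are illustrative assumptions, not code from the package):

#include "clip.h"

// Record the source image's dimensions on the CLIP context before encoding.
// Assumption: only the MiniCPM-V projector path consumes the stored size,
// so other projector types skip the call here.
static void register_image_size(struct clip_ctx * ctx_clip, int nx, int ny) {
    if (!clip_is_minicpmv(ctx_clip)) {
        return;
    }
    struct clip_image_size * load_image_size = clip_image_size_init();
    load_image_size->width  = nx;
    load_image_size->height = ny;
    clip_add_load_image_size(ctx_clip, load_image_size);
}

This mirrors the pattern used by the MiniCPM-V branch in llava.cpp further down, where the per-slice size is registered with clip_add_load_image_size() before each clip_image_encode() call.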
package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -1,14 +1,16 @@
-#include "
+#include "arg.h"
+#include "base64.hpp"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-
-#include "base64.hpp"
+#include "ggml.h"

 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <vector>

 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@@ -19,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
             n_eval = n_batch;
         }
         if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
@@ -40,11 +42,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
     return true;
 }

-static const char * sample(struct
+static const char * sample(struct gpt_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id =
-
+    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
+    gpt_sampler_accept(smpl, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";
@@ -74,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-
+        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
         return NULL;
     }

@@ -88,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip

     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
     if (!embed) {
-
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
         return NULL;
     }

@@ -112,12 +114,10 @@ struct llava_context {
     struct llama_model * model = NULL;
 };

-static void print_usage(int
-
-
-
-    LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int, char ** argv) {
+    LOG("\n example usage:\n");
+    LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -127,16 +127,16 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-
+            LOG_INF("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
-
+            LOG_ERR("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
@@ -157,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
-
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -178,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
@@ -189,21 +189,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

     // generate the response

-
+    LOG("\n");

-    struct
-    if (!
-
+    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
     }

     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(
+        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
         response += tmp;
         if (strcmp(tmp, "</s>") == 0) break;
         if (strstr(tmp, "###")) break; // Yi-VL behavior
-
+        LOG("%s", tmp);
         if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
         if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
         if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -211,8 +211,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         fflush(stdout);
     }

-
-
+    gpt_sampler_free(smpl);
+    LOG("\n");
 }

 static struct llama_model * llava_init(gpt_params * params) {
@@ -223,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {

     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
-
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
     }
     return model;
@@ -246,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

     if (ctx_llama == NULL) {
-
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return NULL;
     }

-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
@@ -269,65 +269,54 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }

-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 int main(int argc, char ** argv) {
     ggml_time_init();

     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
         return 1;
     }

-
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+    gpt_init();

     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv
+        print_usage(argc, argv);
         return 1;
     }
-
+
+    auto * model = llava_init(&params);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }

     if (prompt_contains_image(params.prompt)) {
-        auto ctx_llava = llava_init_context(&params, model);
+        auto * ctx_llava = llava_init_context(&params, model);

-        auto image_embed = load_image(ctx_llava, &params, "");
+        auto * image_embed = load_image(ctx_llava, &params, "");

         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);

-
+        llama_perf_context_print(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
     } else {
         for (auto & image : params.image) {
-            auto ctx_llava = llava_init_context(&params, model);
+            auto * ctx_llava = llava_init_context(&params, model);

-            auto image_embed = load_image(ctx_llava, &params, image);
+            auto * image_embed = load_image(ctx_llava, &params, image);
             if (!image_embed) {
-
+                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
                 return 1;
             }

             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);

-
+            llama_perf_context_print(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
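The llava-cli changes above track the new common-layer APIs from this release: argument parsing goes through gpt_params_parse() with an example id (LLAMA_EXAMPLE_LLAVA) and a usage callback, logging is initialized with gpt_init() and routed through LOG/LOG_INF/LOG_ERR, and sampling uses a gpt_sampler object instead of the removed grammar-parser/sampling helpers. A rough sketch of the per-token step implied by the diff (the helper name is an illustrative assumption; the gpt_sampler_* calls are the ones used above):

#include "common.h"
#include "sampling.h"

#include <string>

// Draw one token with the gpt_sampler API and feed the choice back into the
// sampler state, then convert it to text; mirrors sample() in llava-cli.cpp.
static std::string sample_one(struct gpt_sampler * smpl, struct llama_context * ctx_llama) {
    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); // sample from the last decoded batch
    gpt_sampler_accept(smpl, id, true);                             // update the sampler's internal state
    return llama_token_to_piece(ctx_llama, id);
}

The sampler itself is created once per generation with gpt_sampler_init(model, params.sparams) and released with gpt_sampler_free(smpl), as shown in process_prompt() above.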
package/src/llama.cpp/examples/llava/llava.cpp

@@ -1,13 +1,23 @@
 #include "clip.h"
-#include "common.h"
-#include "llama.h"
 #include "llava.h"
-#include "base64.hpp"

+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <vector>
-
+
+#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)

 // RGB uint8 image
 struct clip_image_u8 {
@@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        //
+        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -202,6 +212,33 @@
     return true;
 }

+static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+    int width = image->nx;
+    int height = image->ny;
+    int num_patches = (height / patch_size) * (width / patch_size);
+    clip_image_f32 * patch = clip_image_f32_init();
+    patch->nx = patch_size * num_patches;
+    patch->ny = patch_size;
+    patch->buf.resize(3 * patch->nx * patch->ny);
+
+    int patch_index = 0;
+
+    for (int i = 0; i < height; i += patch_size) {
+        for (int j = 0; j < width; j += patch_size) {
+            for (int pi = 0; pi < patch_size; ++pi) {
+                for (int pj = 0; pj < patch_size; ++pj) {
+                    int input_index = ((i + pi) * width + (j + pj)) * 3;
+                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+                    patch->buf[output_index] = image->buf[input_index];
+                    patch->buf[output_index+1] = image->buf[input_index+1];
+                    patch->buf[output_index+2] = image->buf[input_index+2];
+                }
+            }
+            patch_index++;
+        }
+    }
+    return patch;
+}

 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@@ -209,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     img_res_v.size = 0;
     img_res_v.data = nullptr;
     if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
     }
@@ -218,17 +255,62 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

-    if (
+    if (clip_is_minicpmv(ctx_clip)) {
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            const int64_t t_img_enc_step_start_us = ggml_time_us();
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);
+            bool encoded = false;
+            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+            if (has_minicpmv_projector == 2) {
+                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            }
+            else if (has_minicpmv_projector == 3) {
+                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+            }
+            if (!encoded) {
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        int n_img_pos_out = 0;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+            n_img_pos_out += clip_n_patches(ctx_clip);
+        }
+        *n_img_pos = n_img_pos_out;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+    }
+    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
         delete[] img_res_v.data;
         if (!encoded) {
-
+            LOG_ERR("Unable to encode image\n");

             return false;
         }
-    }
+    }
+    else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
@@ -237,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
         const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
         if (!encoded) {
-
+            LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
             return false;
         }
     }
     const int64_t t_img_enc_batch_us = ggml_time_us();
-
+    LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

     const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -275,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
     }

-
+    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-
+    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

     return true;
 }
@@ -290,22 +372,26 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
-
+        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
         return false;
     }
     return true;
 }

 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-
+    int num_max_patches = 6;
+    if (clip_is_minicpmv(ctx_clip)) {
+        num_max_patches = 10;
+    }
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     if (!image_embd) {
-
+        LOG_ERR("Unable to allocate memory for image embeddings\n");
         return false;
     }

     int n_img_pos;
     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-
+        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
@@ -325,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         }
         llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
-
+            LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
         *n_past += n_eval;
@@ -337,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
-
+        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
     }

@@ -346,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-
+        LOG_ERR("%s: coulnd't embed the image\n", __func__);
         return NULL;
     }

@@ -360,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
     auto file = fopen(path, "rb");
     if (file == NULL) {
-
+        LOG_ERR("%s: can't read file %s\n", __func__, path);
         return false;
     }

@@ -370,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
     if (buffer == NULL) {
-
+        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
         perror("Memory allocation error");
         fclose(file);
         return false;
@@ -395,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
     long image_bytes_length;
     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
     if (!loaded) {
-
+        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
         return NULL;
     }

package/src/llama.cpp/examples/llava/llava.h

@@ -17,12 +17,11 @@
 #    define LLAVA_API
 #endif

-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif

+struct clip_ctx;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);

 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);