@fugood/llama.node 0.3.17 → 0.4.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/tools/mtmd/clip.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "ggml.h"
+#include <stddef.h>
+#include <stdint.h>
+
+struct clip_ctx;
+
+struct clip_image_size {
+    int width;
+    int height;
+};
+
+struct clip_image_f32;
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
+
+struct clip_context_params {
+    bool use_gpu;
+    enum ggml_log_level verbosity;
+};
+
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
+
+void clip_free(struct clip_ctx * ctx);
+
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+
+int32_t clip_get_image_size (const struct clip_ctx * ctx);
+int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+
+// TODO: should be enum, not string
+const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
+
+struct clip_image_size * clip_image_size_init(void);
+struct clip_image_u8  * clip_image_u8_init (void);
+struct clip_image_f32 * clip_image_f32_init(void);
+struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+
+// nx, ny are the output image dimensions
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+void clip_image_size_free (struct clip_image_size * img_size);
+void clip_image_u8_free (struct clip_image_u8  * img);
+void clip_image_f32_free(struct clip_image_f32 * img);
+void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+
+// use for accessing underlay data of clip_image_f32_batch
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
+bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+
+/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
+
+struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+int clip_is_minicpmv(const struct clip_ctx * ctx);
+bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+bool clip_is_llava(const struct clip_ctx * ctx);
+bool clip_is_gemma3(const struct clip_ctx * ctx);
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
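For orientation, the header above can be driven on its own, without the rest of libmtmd. The following is a minimal, hypothetical sketch using only functions declared above; the model path mmproj.gguf and image input.jpg are placeholders, the thread count is arbitrary, and sizing the output with a single clip_embd_nbytes(ctx) buffer is a simplification that assumes preprocessing produced one output image:

// Hypothetical sketch: load an mmproj model, preprocess one image, encode it.
#include "clip.h"
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    struct clip_context_params cparams = { /*use_gpu =*/ true, /*verbosity =*/ GGML_LOG_LEVEL_INFO };
    struct clip_ctx * ctx = clip_init("mmproj.gguf", cparams); // placeholder path
    if (!ctx) {
        return 1;
    }

    struct clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("input.jpg", img)) {        // placeholder path
        return 1;
    }

    // preprocessing may slice the image into a batch, depending on the model
    struct clip_image_f32_batch * batch = clip_image_f32_batch_init();
    if (!clip_image_preprocess(ctx, img, batch)) {
        return 1;
    }

    // assumption: a single output image, so clip_embd_nbytes(ctx) bytes suffice
    float * embd = (float *) malloc(clip_embd_nbytes(ctx));
    if (!clip_image_batch_encode(ctx, /*n_threads =*/ 4, batch, embd)) {
        return 1;
    }
    printf("embedding dim per token: %d\n", clip_n_mmproj_embd(ctx));

    free(embd);
    clip_image_f32_batch_free(batch);
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}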
package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp
@@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
 #endif
 
 struct mtmd_cli_context {
-
+    mtmd::context_ptr ctx_vision;
     common_init_result llama_init;
 
     llama_model * model;
@@ -72,6 +72,8 @@ struct mtmd_cli_context {
     llama_batch batch;
     int n_batch;
 
+    mtmd::bitmaps bitmaps;
+
     // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
     // so here we don't need to keep track of chat history
     common_chat_templates_ptr tmpls;
@@ -90,10 +92,15 @@ struct mtmd_cli_context {
         batch = llama_batch_init(params.n_batch, 0, 1);
         n_batch = params.n_batch;
 
+        if (!model || !lctx) {
+            exit(1);
+        }
+
         if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
             LOG_ERR("Model does not have chat template.\n");
             LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
             LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
+            LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
             exit(1);
         }
 
@@ -112,12 +119,12 @@ struct mtmd_cli_context {
 
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-
-
-
-
-
-
+        mtmd_context_params mparams = mtmd_context_params_default();
+        mparams.use_gpu = params.mmproj_use_gpu;
+        mparams.print_timings = true;
+        mparams.n_threads = params.cpuparams.n_threads;
+        mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
@@ -134,13 +141,22 @@ struct mtmd_cli_context {
             antiprompt_tokens.begin()
         );
     }
+
+    bool load_image(const std::string & fname) {
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
+        if (!bmp.ptr) {
+            return false;
+        }
+        bitmaps.entries.push_back(std::move(bmp));
+        return true;
+    }
 };
 
 static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
     llama_tokens generated_tokens;
     for (int i = 0; i < n_predict; i++) {
         if (i > n_predict || !g_is_generating || g_is_interrupted) {
-
+            LOG("\n");
             break;
         }
 
@@ -149,15 +165,15 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
         common_sampler_accept(smpl, token_id, true);
 
         if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
-
+            LOG("\n");
             break; // end of generation
         }
 
-
+        LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
         fflush(stdout);
 
         if (g_is_interrupted) {
-
+            LOG("\n");
             break;
         }
 
@@ -172,9 +188,7 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
     return 0;
 }
 
-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg,
-    std::vector<mtmd_bitmap> bitmaps;
-
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
     tmpl_inputs.add_generation_prompt = true;
@@ -182,35 +196,43 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
     auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 
-    for (auto & fname : images_fname) {
-        mtmd_bitmap bitmap;
-        if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
-            LOG_ERR("Unable to load image %s\n", fname.c_str());
-            return 2; // image not found
-        }
-        bitmaps.push_back(std::move(bitmap));
-    }
-
     mtmd_input_text text;
-    text.text = formatted_chat.prompt;
+    text.text = formatted_chat.prompt.c_str();
     text.add_special = add_bos;
     text.parse_special = true;
-    mtmd_input_chunks chunks;
 
     if (g_is_interrupted) return 0;
 
-
+    mtmd::input_chunks chunks(mtmd_input_chunks_init());
+    auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
+    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
+                                chunks.ptr.get(), // output
+                                &text, // text
+                                bitmaps_c_ptr.data(),
+                                bitmaps_c_ptr.size());
     if (res != 0) {
         LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
         return 1;
     }
 
-
+    ctx.bitmaps.entries.clear();
+
+    llama_pos new_n_past;
+    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
+                                ctx.lctx, // lctx
+                                chunks.ptr.get(), // chunks
+                                ctx.n_past, // n_past
+                                0, // seq_id
+                                ctx.n_batch, // n_batch
+                                true, // logits_last
+                                &new_n_past)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past
+    ctx.n_past = new_n_past;
+
+    LOG("\n");
 
     return 0;
 }
@@ -234,14 +256,14 @@ int main(int argc, char ** argv) {
     }
 
     mtmd_cli_context ctx(params);
-
+    LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
 
     bool is_single_turn = !params.prompt.empty() && !params.image.empty();
 
     struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
     int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
-    //
+    // Ctrl+C handling
     {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -267,7 +289,12 @@ int main(int argc, char ** argv) {
         common_chat_msg msg;
         msg.role = "user";
         msg.content = params.prompt;
-
+        for (const auto & image : params.image) {
+            if (!ctx.load_image(image)) {
+                return 1; // error is already printed by libmtmd
+            }
+        }
+        if (eval_message(ctx, msg, true)) {
            return 1;
        }
        if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
@@ -282,7 +309,6 @@
     LOG("\n");
 
     bool is_first_msg = true;
-    std::vector<std::string> images_fname;
     std::string content;
 
     while (!g_is_interrupted) {
@@ -307,10 +333,17 @@
             continue;
         }
         g_is_generating = true;
-        if (line.find("/image") == 0) {
+        if (line == "/image" || line.find("/image ") == 0) {
+            if (line.size() < 8) {
+                LOG_ERR("ERR: Missing image filename\n");
+                continue;
+            }
             std::string image = line.substr(7);
-
-
+            if (ctx.load_image(image)) {
+                LOG("Image %s loaded\n", image.c_str());
+                content += "<__image__>";
+            }
+            // else, error is already printed by libmtmd
             continue;
         } else {
             content += line;
@@ -318,21 +351,14 @@
         common_chat_msg msg;
         msg.role = "user";
         msg.content = content;
-        int ret = eval_message(ctx, msg,
-        if (g_is_interrupted) break;
-        if (ret == 2) {
-            // non-fatal error
-            images_fname.clear();
-            content.clear();
-            continue;
-        }
+        int ret = eval_message(ctx, msg, is_first_msg);
         if (ret) {
            return 1;
        }
+        if (g_is_interrupted) break;
        if (generate_response(ctx, smpl, n_predict)) {
            return 1;
        }
-        images_fname.clear();
        content.clear();
        is_first_msg = false;
    }
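Condensed, the hunks above replace the old per-call image plumbing with opaque mtmd objects: images load into mtmd::bitmaps, the templated prompt (with one "<__image__>" marker per bitmap) goes through mtmd_tokenize into chunks, and mtmd_helper_eval_chunks advances n_past across text and image chunks alike. The sketch below restates that sequence using only calls visible in this diff; the function name eval_prompt_with_images is hypothetical, and it assumes mtmd.h declares the mtmd:: C++ wrappers and helper functions as used by mtmd-cli.cpp:

// Hypothetical sketch of the eval path introduced above.
#include "llama.h"
#include "mtmd.h"

#include <string>
#include <vector>

static bool eval_prompt_with_images(mtmd_context * ctx_vision,
                                    llama_context * lctx,
                                    const std::string & prompt,  // one "<__image__>" marker per image
                                    const std::vector<std::string> & image_paths,
                                    llama_pos & n_past,
                                    int32_t n_batch,
                                    bool add_bos) {
    mtmd::bitmaps bitmaps;
    for (const auto & fname : image_paths) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
        if (!bmp.ptr) {
            return false; // error is already printed by libmtmd
        }
        bitmaps.entries.push_back(std::move(bmp));
    }

    mtmd_input_text text;
    text.text          = prompt.c_str();
    text.add_special   = add_bos;
    text.parse_special = true;

    // tokenize text and images together into a list of chunks
    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = bitmaps.c_ptr();
    if (mtmd_tokenize(ctx_vision, chunks.ptr.get(), &text,
                      bitmaps_c_ptr.data(), bitmaps_c_ptr.size()) != 0) {
        return false;
    }

    // encode/decode every chunk, tracking the new position
    llama_pos new_n_past;
    if (mtmd_helper_eval_chunks(ctx_vision, lctx, chunks.ptr.get(),
                                n_past, /*seq_id*/ 0, n_batch,
                                /*logits_last*/ true, &new_n_past) != 0) {
        return false;
    }
    n_past = new_n_past; // sampling continues from here, as in generate_response()
    return true;
}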
package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp
@@ -0,0 +1,310 @@
+#include "mtmd.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <vector>
+
+#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
+#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
+
+size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
+    size_t n_tokens = 0;
+    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
+        auto chunk = mtmd_input_chunks_get(chunks, i);
+        auto chunk_type = mtmd_input_chunk_get_type(chunk);
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens_text;
+            mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
+            n_tokens += n_tokens_text;
+        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
+            n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+    return n_tokens;
+}
+
+llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
+    llama_pos n_pos = 0;
+    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
+        auto chunk = mtmd_input_chunks_get(chunks, i);
+        auto chunk_type = mtmd_input_chunk_get_type(chunk);
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens_text;
+            mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
+            n_pos += n_tokens_text;
+        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
+            n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+    return n_pos;
+}
+
+// helper struct to make working with embd batch easier
+// note: this will be removed after llama_batch_ext refactoring
+struct decode_embd_batch {
+    int n_pos_per_embd;
+    int n_mmproj_embd;
+    std::vector<llama_pos>      pos;
+    std::vector<llama_pos>      pos_view; // used by mrope
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+        pos     .resize(n_tokens * n_pos_per_embd);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+    }
+
+    void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
+        seq_id_0[0] = seq_id;
+        for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+
+    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        seq_id_0[0] = seq_id;
+        for (int y = 0; y < ny; y++) {
+            for (int x = 0; x < nx; x++) {
+                int i = y * nx + x;
+                pos[i                     ] = pos_0;
+                pos[i + batch.n_tokens    ] = pos_0 + y;
+                pos[i + batch.n_tokens * 2] = pos_0 + x;
+                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            }
+        }
+        for (int i = 0; i < batch.n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+
+    llama_batch get_view(int offset, int n_tokens) {
+        llama_pos * pos_ptr;
+        pos_view.clear();
+        pos_view.reserve(n_tokens * n_pos_per_embd);
+        if (n_pos_per_embd > 1) {
+            // mrope
+            // for example, with layout of src: 1234...1234...1234...1234...
+            //       offset 2 will give us dst: 34...34...34...34...
+            for (int i = 0; i < n_pos_per_embd; i++) {
+                // assume n_tokens is less than or equal to batch.n_tokens
+                // batch.n_tokens is number of **total** tokens
+                // n_tokens is number of viewed token
+                size_t src_idx = i * batch.n_tokens + offset;
+                pos_view.insert(pos_view.end(),
+                    pos.data() + src_idx,
+                    pos.data() + src_idx + n_tokens);
+            }
+            pos_ptr = pos_view.data();
+        } else {
+            // normal
+            pos_ptr = pos.data() + offset;
+        }
+        return {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ batch.embd     + offset * n_mmproj_embd,
+            /*pos            =*/ pos_ptr,
+            /*n_seq_id       =*/ batch.n_seq_id + offset,
+            /*seq_id         =*/ batch.seq_id   + offset,
+            /*logits         =*/ batch.logits   + offset,
+        };
+    }
+};
+
+// Helper function for decoding an image whose embeddings have already been calculated
+int32_t mtmd_helper_decode_image_chunk(
+        mtmd_context * ctx,
+        struct llama_context * lctx,
+        const mtmd_input_chunk * chunk,
+        float * encoded_embd,
+        llama_pos n_past,
+        llama_seq_id seq_id,
+        int32_t n_batch,
+        llama_pos * new_n_past) {
+    if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
+        return -1;
+    }
+    const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+    if (!image_tokens) {
+        LOG_ERR("failed to decode image chunk: image tokens are null\n");
+        return -1;
+    }
+
+    const llama_model * model = llama_get_model(lctx);
+    int n_mmproj_embd = llama_model_n_embd(model);
+    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
+
+    int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+    int32_t i_batch = 0;
+    int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
+    decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+    const int nx = mtmd_image_tokens_get_nx(image_tokens);
+    const int ny = mtmd_image_tokens_get_ny(image_tokens);
+
+    if (mtmd_decode_use_mrope(ctx)) {
+        batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
+    } else {
+        batch_embd.set_position_normal(n_past, seq_id);
+    }
+
+    if (mtmd_decode_use_non_causal(ctx)) {
+        llama_set_causal_attn(lctx, false);
+        // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
+    }
+
+    while (i_batch < n_img_batches) { // split into batches
+        int pos_offset = i_batch*n_batch;
+        int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+        llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
+
+        LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+
+        int64_t t1 = ggml_time_ms();
+        int32_t ret = llama_decode(lctx, batch_embd_view);
+        if (ret != 0) {
+            LOG_ERR("failed to decode image\n");
+            llama_set_causal_attn(lctx, true); // restore causal attn
+            return ret;
+        }
+
+        LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
+
+        i_batch++;
+    }
+
+    n_past += mtmd_image_tokens_get_n_pos(image_tokens);
+    *new_n_past = n_past;
+
+    if (mtmd_decode_use_non_causal(ctx)) {
+        llama_set_causal_attn(lctx, true);
+    }
+    return 0;
+}
+
+int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+        struct llama_context * lctx,
+        const mtmd_input_chunk * chunk,
+        llama_pos n_past,
+        llama_seq_id seq_id,
+        int32_t n_batch,
+        bool logits_last,
+        llama_pos * new_n_past) {
+    int32_t ret;
+    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+    auto chunk_type = mtmd_input_chunk_get_type(chunk);
+
+    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        size_t n_tokens;
+        const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+        // LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
+        size_t i = 0;
+        while (i < n_tokens) { // split into batches
+            text_batch.n_tokens = 0; // clear the batch
+            for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
+                text_batch.n_tokens++;
+                text_batch.token   [i] = tokens[i];
+                text_batch.pos     [i] = n_past++;
+                text_batch.n_seq_id[i] = 1;
+                text_batch.seq_id  [i][0] = seq_id;
+                text_batch.logits  [i] = false;
+            }
+            bool is_last_token = (i == n_tokens);
+            if (logits_last && is_last_token) {
+                text_batch.logits[text_batch.n_tokens - 1] = true;
+            }
+            ret = llama_decode(lctx, text_batch);
+            if (ret != 0) {
+                LOG_ERR("failed to decode text\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+            *new_n_past += text_batch.n_tokens;
+        }
+
+    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+        int64_t t0 = ggml_time_ms();
+
+        LOG_INF("encoding image or slice...\n");
+
+        ret = mtmd_encode(ctx, image_tokens);
+        if (ret != 0) {
+            LOG_ERR("failed to encode image\n");
+            llama_batch_free(text_batch);
+            return ret;
+        }
+
+        LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+
+        float * embd = mtmd_get_output_embd(ctx);
+        ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
+        if (ret != 0) {
+            LOG_ERR("failed to decode image\n");
+            llama_batch_free(text_batch);
+            return ret;
+        }
+    } else {
+        GGML_ABORT("chunk type not supported");
+    }
+
+    return 0;
+}
+
+int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+        struct llama_context * lctx,
+        const mtmd_input_chunks * chunks,
+        llama_pos n_past,
+        llama_seq_id seq_id,
+        int32_t n_batch,
+        bool logits_last,
+        llama_pos * new_n_past) {
+    size_t n_chunks = mtmd_input_chunks_size(chunks);
+    if (n_chunks == 0) {
+        LOG_ERR("no chunks to eval\n");
+        return 0;
+    }
+
+    for (size_t i = 0; i < n_chunks; i++) {
+        bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
+        auto chunk = mtmd_input_chunks_get(chunks, i);
+
+        int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
+        if (res != 0) {
+            LOG_ERR("failed to eval chunk %zu\n", i);
+            return res;
+        }
+        *new_n_past = n_past;
+    }
+
+    return 0;
+}
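The least obvious part of decode_embd_batch above is the M-RoPE position layout: pos holds four consecutive n_tokens-long sections (time, y, x, unused). As a worked example, for a hypothetical nx = 3, ny = 2 image grid (6 tokens) placed at pos_0 = 10, set_position_mrope fills pos as:

section 0 (time):    10 10 10 10 10 10
section 1 (y):       10 10 10 11 11 11
section 2 (x):       10 11 12 10 11 12
section 3 (unused):   0  0  0  0  0  0

get_view(offset, n_tokens) then copies the same offset window out of each of the four sections into pos_view, which is exactly what the "1234... -> 34..." comment inside get_view describes; for the normal (non-mrope) case a plain pointer offset into the single section suffices.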