@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
Selected file diffs:

package/src/llama.cpp/examples/llava/mtmd.h (new file, +168 -0):

```cpp
#ifndef MTMD_H
#define MTMD_H

#include "ggml.h"
#include "llama.h"
#include "clip.h"

#include <vector>
#include <cinttypes>
#include <memory>

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define MTMD_API __declspec(dllexport)
#        else
#            define MTMD_API __declspec(dllimport)
#        endif
#    else
#        define MTMD_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define MTMD_API
#endif

#ifdef __cplusplus

enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT,
    MTMD_INPUT_CHUNK_TYPE_IMAGE,
};

struct mtmd_context;
struct mtmd_image_tokens;

// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
struct mtmd_bitmap {
    uint32_t nx;
    uint32_t ny;
    std::vector<unsigned char> data;
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
};

struct mtmd_image_tokens_deleter {
    void operator()(mtmd_image_tokens * val); // forward declaration
};
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;

struct mtmd_input_chunk {
    mtmd_input_chunk_type type;
    std::vector<llama_token> tokens_text;
    mtmd_image_tokens_ptr tokens_image;
};

using mtmd_input_chunks = std::vector<mtmd_input_chunk>;

struct mtmd_context_params {
    bool use_gpu = true;
    bool print_timings = true;
    int n_threads = 4;
    enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
    const char * image_marker = "<__image__>";
};

struct mtmd_input_text {
    std::string text;
    bool add_special;
    bool parse_special;
};

// initialize the mtmd context
// return nullptr on failure
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
                                            const llama_model * text_model,
                                            const mtmd_context_params ctx_params);

MTMD_API void mtmd_free(mtmd_context * ctx);

// tokenize an input text prompt and an image
// the prompt must have the input image marker (default: "<__image__>") in it
// the marker will be replaced with the image tokens
// for example:
// "here is an image: <__image__>\ndescribe it in detail."
// this will gives 3 chunks:
// 1. "here is an image: <start_of_image>"
// 2. (image tokens)
// 3. "<end_of_image>\ndescribe it in detail."
// number of bitmaps must be equal to the number of image markers in the prompt
// this function is thread-safe (shared ctx)
// return values:
// 0 on success
// 1 on number of images not matching the number of markers
// 2 on image preprocessing error
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                               std::vector<mtmd_input_chunk> & output,
                               const mtmd_input_text & text,
                               const std::vector<mtmd_bitmap> & bitmaps);

// access mtmd_image_tokens
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);

// returns 0 on success
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
                             const mtmd_image_tokens * image_tokens);

// get output embeddings from the last encode pass
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);

// whether we need to set non-causal mask before llama_decode
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);

// whether the current model use M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);



//
// helper functions (can be implemented based on other functions)
//

// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);

// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);

// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
                                  llama_context * lctx,
                                  mtmd_input_chunks & chunks,
                                  llama_pos pos0,
                                  llama_seq_id seq_id,
                                  int32_t n_batch);

// helper function to construct a mtmd_bitmap from a file
// returns 0 on success
// this function is thread-safe
MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);

// helper function to construct a mtmd_bitmap from a buffer
// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
// returns 0 on success
// this function is thread-safe
MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);

// convenient unique_ptr wrappers
struct mtmd_context_deleter {
    void operator()(mtmd_context * val) { mtmd_free(val); }
};
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;

#else

static_assert(false && "C header is not yet supported by this library");

#endif

#endif
```
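For orientation, here is a minimal usage sketch assembled only from the declarations above. It is not code from the package: the file names, the already-loaded `llama_model`/`llama_context`, and the `n_batch`/`seq_id` values are placeholder assumptions.

```cpp
// Hypothetical sketch built from the mtmd.h declarations above.
// "mmproj.gguf" and "image.jpg" are placeholder paths; text_model and lctx
// are assumed to have been created with the usual llama.cpp APIs.
#include "mtmd.h"

static int32_t describe_image(llama_model * text_model, llama_context * lctx) {
    mtmd_context_params mparams;  // defaults: use_gpu, 4 threads, "<__image__>" marker
    mtmd_context_ptr mctx(mtmd_init_from_file("mmproj.gguf", text_model, mparams));
    if (!mctx) {
        return 1;                 // mtmd_init_from_file returns nullptr on failure
    }

    mtmd_bitmap bitmap;           // filled with RGBRGB... pixel data by the helper
    if (mtmd_helper_bitmap_init_from_file("image.jpg", bitmap) != 0) {
        return 1;
    }

    mtmd_input_text text;
    text.text          = "here is an image: <__image__>\ndescribe it in detail.";
    text.add_special   = true;
    text.parse_special = true;

    // split the prompt into text chunks and image-token chunks
    mtmd_input_chunks chunks;
    if (mtmd_tokenize(mctx.get(), chunks, text, {bitmap}) != 0) {
        return 1;                 // 1: marker/image count mismatch, 2: preprocessing error
    }

    // decode text chunks and encode+decode image chunks in one call
    return mtmd_helper_eval(mctx.get(), lctx, chunks, /*pos0=*/0, /*seq_id=*/0, /*n_batch=*/512);
}
```

After `mtmd_helper_eval()` succeeds, `mtmd_helper_get_n_tokens(chunks)` and `mtmd_helper_get_n_pos(chunks)` give the counts needed to keep the caller's KV cache and `n_past` bookkeeping in sync, per the comments in the header.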
package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp}:

```diff
@@ -23,7 +23,12 @@
 #include <algorithm>
 #include <iostream>
 #include <fstream>
+#include <limits>
+#include <cassert>
+#include <cmath>
 
+// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
+// IT IS NOT A PRODUCTION CODE
 
 static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                      int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
@@ -89,20 +94,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
 
 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
     int N = (int) tokens.size();
-    std::vector<llama_pos> pos;
     for (int i = 0; i < N; i += n_batch) {
         int n_eval = (int) tokens.size() - i;
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
         auto batch = llama_batch_get_one(&tokens[i], n_eval);
-        // TODO: add mrope pos ids somewhere else
-        pos.resize(batch.n_tokens * 4);
-        std::fill(pos.begin(), pos.end(), 0);
-        for (int j = 0; j < batch.n_tokens * 3; j ++) {
-            pos[j] = *st_pos_id + (j % batch.n_tokens);
-        }
-        batch.pos = pos.data();
 
         if (llama_decode(ctx_llama, batch)) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
@@ -314,7 +311,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -323,14 +320,14 @@ static struct llama_model * llava_init(common_params * params) {
 }
 
 static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();
 
     auto prompt = params->prompt;
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, …
+    auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
 
     llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -367,14 +364,14 @@ static void debug_test_mrope_2d() {
     // 1. Initialize backend
     ggml_backend_t backend = NULL;
     std::string backend_name = "";
-#ifdef GGML_USE_CUDA
-[… 6 removed lines not captured in this view]
-#endif
+// #ifdef GGML_USE_CUDA
+//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
+//     backend = ggml_backend_cuda_init(0); // init device 0
+//     backend_name = "cuda";
+//     if (!backend) {
+//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+//     }
+// #endif
     // if there aren't GPU Backends fallback to CPU backend
     if (!backend) {
         backend = ggml_backend_cpu_init();
@@ -483,28 +480,82 @@ static void debug_test_mrope_2d() {
     ggml_backend_free(backend);
 }
 
-[… 4 removed lines not captured in this view]
+enum model_output_type {
+    conv3d,
+    patch_embed,
+    patch_win_attn_scatter,
+    first_attn_layer,
+    last_attn_layer,
+    attn_softmax,
+    final_layer,
+};
+
+static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
+    constexpr int ih = 140;
+    constexpr int iw = 196;
+    // constexpr int ih = 56;
+    // constexpr int iw = 56;
+    // int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
+    int n_embd = 1280;
+    int merge = 1;
+    if (output_type == model_output_type::final_layer) {
+        n_embd = 2048;
+        merge = 2;
+    }
+    else if (output_type == model_output_type::attn_softmax) {
+        merge = 1;
+        n_embd = (ih/14/merge) * (iw/14/merge) * 16;
+    }
+
+    int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
+    float vals[iw * ih * 3];
     // float embd[ne];
     std::vector<float> embd;
     embd.resize(ne);
 
-    for (int i = 0; i < …
+    for (int i = 0; i < iw*ih; i++)
     {
         for (int c = 0; c < 3; c++)
-            vals[i * 3 + c] = (float) …
+            vals[i * 3 + c] = (float)i / (iw*ih);
     }
 
-    clip_encode_float_image(ctx_llava->ctx_clip, …
+    clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
+
+    std::string file_postfix = "";
+    switch (output_type)
+    {
+    case model_output_type::conv3d:
+        file_postfix = "conv3d";
+        break;
+    case model_output_type::patch_embed:
+        file_postfix = "patch_embed";
+        break;
+    case model_output_type::patch_win_attn_scatter:
+        file_postfix = "scatter";
+        break;
+    case model_output_type::first_attn_layer:
+        file_postfix = "first_attn";
+        break;
+    case model_output_type::last_attn_layer:
+        file_postfix = "last_attn";
+        break;
+    case model_output_type::attn_softmax:
+        file_postfix = "attn_softmax";
+        break;
+    case model_output_type::final_layer:
+        file_postfix = "final";
+        break;
+    default:
+        break;
+    }
+    auto output_path = "img_embed_" + file_postfix + ".bin";
 
-    std::ofstream outFile( …
+    std::ofstream outFile(output_path, std::ios::binary);
     if (outFile.is_open()) {
         outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
 
         outFile.close();
-        std::cout << "Data successfully written to …
+        std::cout << "Data successfully written to ::[ " << output_path << std::endl;
     } else {
         std::cerr << "Error opening file!" << std::endl;
     }
@@ -524,7 +575,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
@@ -551,8 +602,9 @@ int main(int argc, char ** argv) {
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);
 
-        debug_test_mrope_2d();
-        debug_dump_img_embed(ctx_llava);
+        // debug_test_mrope_2d();
+        debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
+        // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
 
         llama_perf_context_print(ctx_llava->ctx_llama);
         ctx_llava->model = NULL;
```
package/src/llama.cpp/examples/main/main.cpp:

```diff
@@ -865,9 +865,22 @@ int main(int argc, char ** argv) {
             console::set_display(console::reset);
             display = true;
 
-[… 3 removed lines not captured in this view]
+            if (buffer.empty()) { // Ctrl+D on empty line exits
+                LOG("EOF by user\n");
+                break;
+            }
+
+            if (buffer.back() == '\n') {
+                // Implement #587:
+                // If the user wants the text to end in a newline,
+                // this should be accomplished by explicitly adding a newline by using \ followed by return,
+                // then returning control by pressing return again.
+                buffer.pop_back();
+            }
+
+            if (buffer.empty()) { // Enter key on empty line lets the user pass control back
+                LOG_DBG("empty line, passing control back\n");
+            } else { // Add tokens to embd only if the input buffer is non-empty
                 // append input suffix if any
                 if (!params.input_suffix.empty() && !params.conversation_mode) {
                     LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
@@ -915,8 +928,6 @@ int main(int argc, char ** argv) {
 
                 n_remain -= line_inp.size();
                 LOG_DBG("n_remain: %d\n", n_remain);
-            } else {
-                LOG_DBG("empty line, passing control back\n");
             }
 
             input_echo = false; // do not echo this again
```
package/src/llama.cpp/examples/parallel/parallel.cpp:

```diff
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {
 
     common_params params;
 
+    params.n_predict = 128;
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
@@ -405,7 +407,7 @@ int main(int argc, char ** argv) {
         params.prompt_file = "used built-in defaults";
     }
     LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
 
     LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt) / (t_main_end - t_main_start) * 1e6);
     LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen) / (t_main_end - t_main_start) * 1e6);
```
package/src/llama.cpp/examples/passkey/passkey.cpp:

```diff
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
```
package/src/llama.cpp/examples/perplexity/perplexity.cpp:

```diff
@@ -851,7 +851,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
 
     LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
 
-    LOG("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\t95%% confidence interval\n");
 
     double acc = 0.0f;
 
@@ -985,8 +985,22 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
             acc += 1.0;
         }
 
-[… 2 removed lines not captured in this view]
+        double freq = acc / double(i + 1);
+
+        const double za = 1.95996398454;
+
+        // // Wald normal approx
+        // double conf =za*sqrt(freq*(1-freq)/double(i + 1));
+        // LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0);
+
+        // Wilson score interval, more accurate
+        double z = za * za / double(i + 1);
+        double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za);
+        double a = (freq + z * 0.5 - cnf) / (1.0 + z);
+        double b = (freq + z * 0.5 + cnf) / (1.0 + z);
+
+        // Print the accumulated accuracy mean x 100 and confidence interval
+        LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0);
     }
 
     i0 = i1 - 1;
```
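The interval printed by the new code is the 95% Wilson score interval (the commented-out lines show the simpler Wald approximation it replaces). With `freq` as the running accuracy p̂ over n = i + 1 scored tasks and `za` as the 1.96 normal quantile, the bounds `a` and `b` computed above correspond to:

```latex
% Wilson score interval, as computed by the hunk above
[a, b] \;=\; \frac{\hat{p} + \dfrac{z^{2}}{2n} \;\pm\; z\sqrt{\dfrac{\hat{p}\,(1-\hat{p})}{n} + \dfrac{z^{2}}{4n^{2}}}}{1 + \dfrac{z^{2}}{n}},
\qquad z = 1.95996\ldots
```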
package/src/llama.cpp/examples/quantize/quantize.cpp:

```diff
@@ -9,6 +9,7 @@
 #include <fstream>
 #include <cmath>
 #include <cctype>
+#include <algorithm>
 
 struct quant_option {
     std::string name;
@@ -16,7 +17,7 @@ struct quant_option {
     std::string desc;
 };
 
-static const std::vector< …
+static const std::vector<quant_option> QUANT_OPTIONS = {
     { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
     { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
@@ -105,7 +106,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp…
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] …
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
+    printf("       [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -114,6 +116,8 @@ static void usage(const char * executable) {
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
+    printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -244,6 +248,107 @@ static ggml_type parse_ggml_type(const char * arg) {
     return GGML_TYPE_COUNT;
 }
 
+// Allowed tensors for arbitrary quantization with --tensor-type option
+static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
+    "attn_k",
+    "attn_kv_a_mqa",
+    "attn_kv_b",
+    "attn_o",
+    "attn_output",
+    "attn_q",
+    "attn_q_a",
+    "attn_q_b",
+    "attn_qkv",
+    "attn_v",
+    "channel_mix_key",
+    "channel_mix_receptance",
+    "channel_mix_value",
+    "cls",
+    "cls.output",
+    "cross_attn_k",
+    "cross_attn_o",
+    "cross_attn_q",
+    "cross_attn_v",
+    "ffn_act",
+    "ffn_down",
+    "ffn_down_exps",
+    "ffn_down_shexp",
+    "ffn_gate",
+    "ffn_gate_exps",
+    "ffn_gate_shexp",
+    "ffn_up",
+    "ffn_up_exps",
+    "ffn_up_shexp",
+    "ssm_in",
+    "ssm_out",
+    "time_mix_gate",
+    "time_mix_key",
+    "time_mix_output",
+    "time_mix_receptance",
+    "time_mix_value",
+};
+
+// changes to this struct must be replicated in llama-quant.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
+static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr) {
+        printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
+        return false;
+    }
+
+    const size_t tn_len = sep - data;
+    if (tn_len == 0) {
+        printf("\n%s: missing tensor name\n\n", __func__);
+        return false;
+    }
+
+    if (const size_t qt_len = strlen(sep); qt_len == 1) {
+        printf("\n%s: missing quantization type\n\n", __func__);
+        return false;
+    }
+
+    std::string tn(data, tn_len);
+    std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
+    sep++;
+    const std::string qt(sep);
+
+    bool found = false;
+    for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
+        std::string tensor;
+        tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
+        // handle special case of cls.output
+        std::string cls_output = "cls.output";
+        if (tn.find(cls_output) != std::string::npos) {
+            tensor = "cls.output";
+        }
+        // check if an allowed tensor exists and it's at the end of the kv string
+        if (tensor == allowed) {
+            found = true;
+            break;
+        }
+    }
+    if (!found) {
+        printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
+        return false;
+    }
+
+    if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
+        printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
+        return false;
+    }
+
+    tensor_quantization tqz;
+    tqz.name = tn;
+    tqz.quant = parse_ggml_type(qt.c_str());
+    tensor_type.emplace_back(std::move(tqz));
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -255,6 +360,7 @@ int main(int argc, char ** argv) {
     std::string imatrix_file;
     std::vector<std::string> included_weights, excluded_weights;
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<tensor_quantization> tensor_types;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -277,6 +383,10 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
+            if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
             if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                 usage(argv[0]);
@@ -361,6 +471,9 @@ int main(int argc, char ** argv) {
         kv_overrides.back().key[0] = 0;
         params.kv_overrides = &kv_overrides;
     }
+    if (!tensor_types.empty()) {
+        params.tensor_types = &tensor_types;
+    }
 
     llama_backend_init();
 
```