@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -390,6 +390,7 @@ extern "C" {
|
|
|
390
390
|
void * imatrix; // pointer to importance matrix data
|
|
391
391
|
void * kv_overrides; // pointer to vector containing overrides
|
|
392
392
|
void * tensor_types; // pointer to vector containing tensor types
|
|
393
|
+
void * prune_layers; // pointer to vector containing layer indices to prune
|
|
393
394
|
} llama_model_quantize_params;
|
|
394
395
|
|
|
395
396
|
typedef struct llama_logit_bias {
|
|
@@ -943,12 +944,14 @@ extern "C" {
|
|
|
943
944
|
// Requires the context to have a memory.
|
|
944
945
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
945
946
|
// Positive return values do not mean a fatal error, but rather a warning.
|
|
946
|
-
// Upon
|
|
947
|
+
// Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
|
|
948
|
+
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
|
|
949
|
+
// Upon other return values, the memory state is restored to the state before this call
|
|
947
950
|
// 0 - success
|
|
948
951
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
949
|
-
// 2 - aborted
|
|
952
|
+
// 2 - aborted (processed ubatches will remain in the context's memory)
|
|
950
953
|
// -1 - invalid input batch
|
|
951
|
-
// < -1 - error
|
|
954
|
+
// < -1 - fatal error (processed ubatches will remain in the context's memory)
|
|
952
955
|
LLAMA_API int32_t llama_decode(
|
|
953
956
|
struct llama_context * ctx,
|
|
954
957
|
struct llama_batch batch);
|
|
@@ -1044,6 +1047,7 @@ extern "C" {
|
|
|
1044
1047
|
|
|
1045
1048
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
1046
1049
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
1050
|
+
LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
|
|
1047
1051
|
|
|
1048
1052
|
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
|
1049
1053
|
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
|
@@ -1087,6 +1091,7 @@ extern "C" {
|
|
|
1087
1091
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
|
1088
1092
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
|
1089
1093
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
|
1094
|
+
/// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
|
|
1090
1095
|
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
|
|
1091
1096
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
|
1092
1097
|
/// as plaintext. Does not insert a leading space.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/LlamaCppModel.cpp
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include <cstdlib>
|
|
6
6
|
#include <ctime>
|
|
7
7
|
#include <chrono>
|
|
8
|
+
#include <thread>
|
|
8
9
|
#include <fstream>
|
|
9
10
|
#include <iostream>
|
|
10
11
|
#include <random>
|
|
@@ -50,33 +51,60 @@ LlamaCppModel::~LlamaCppModel() {
|
|
|
50
51
|
}
|
|
51
52
|
|
|
52
53
|
void LlamaCppModel::release() {
|
|
53
|
-
//
|
|
54
|
+
// Signal completion to stop and wait for it to finish gracefully
|
|
54
55
|
if (is_predicting_) {
|
|
55
56
|
should_stop_completion_ = true;
|
|
56
57
|
|
|
57
|
-
//
|
|
58
|
+
// Wait more patiently for completion to stop, with proper backoff
|
|
58
59
|
int retry = 0;
|
|
59
|
-
while (is_predicting_ && retry <
|
|
60
|
-
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
|
60
|
+
while (is_predicting_ && retry < 100) { // Increased from 10 to 100
|
|
61
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(retry < 50 ? 10 : 50));
|
|
61
62
|
retry++;
|
|
62
63
|
}
|
|
64
|
+
|
|
65
|
+
// Force stop if still predicting
|
|
66
|
+
if (is_predicting_) {
|
|
67
|
+
is_predicting_ = false;
|
|
68
|
+
}
|
|
63
69
|
}
|
|
64
70
|
|
|
65
|
-
// Clean up our resources
|
|
71
|
+
// Clean up our resources with proper mutex protection
|
|
66
72
|
if (rn_ctx_) {
|
|
73
|
+
std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
|
|
74
|
+
|
|
75
|
+
// Clear KV cache before freeing context (following server.cpp pattern)
|
|
67
76
|
if (rn_ctx_->ctx) {
|
|
77
|
+
try {
|
|
78
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
79
|
+
} catch (...) {
|
|
80
|
+
// Ignore errors during cache clearing
|
|
81
|
+
}
|
|
82
|
+
|
|
68
83
|
llama_free(rn_ctx_->ctx);
|
|
69
84
|
rn_ctx_->ctx = nullptr;
|
|
70
85
|
}
|
|
71
86
|
|
|
87
|
+
// Free model after context (following server.cpp cleanup order)
|
|
72
88
|
if (rn_ctx_->model) {
|
|
73
89
|
llama_model_free(rn_ctx_->model);
|
|
74
90
|
rn_ctx_->model = nullptr;
|
|
75
91
|
}
|
|
76
92
|
|
|
93
|
+
// Clean up additional resources
|
|
94
|
+
rn_ctx_->vocab = nullptr; // This is owned by the model, so just null it
|
|
95
|
+
rn_ctx_->chat_templates.reset(); // Clean up chat templates
|
|
96
|
+
rn_ctx_->lora_adapters.clear(); // Clear LoRA adapters
|
|
97
|
+
|
|
98
|
+
// Reset state flags
|
|
99
|
+
rn_ctx_->model_loaded = false;
|
|
100
|
+
|
|
77
101
|
// Note: rn_ctx_ itself is owned by the module, so we don't delete it here
|
|
78
102
|
rn_ctx_ = nullptr;
|
|
79
103
|
}
|
|
104
|
+
|
|
105
|
+
// Reset our internal state
|
|
106
|
+
should_stop_completion_ = false;
|
|
107
|
+
is_predicting_ = false;
|
|
80
108
|
}
|
|
81
109
|
|
|
82
110
|
int32_t LlamaCppModel::getVocabSize() const {
|
|
@@ -133,6 +161,10 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
|
|
|
133
161
|
options.min_p = obj.getProperty(rt, "min_p").asNumber();
|
|
134
162
|
}
|
|
135
163
|
|
|
164
|
+
if (obj.hasProperty(rt, "presence_penalty") && !obj.getProperty(rt, "presence_penalty").isUndefined()) {
|
|
165
|
+
options.presence_penalty = obj.getProperty(rt, "presence_penalty").asNumber();
|
|
166
|
+
}
|
|
167
|
+
|
|
136
168
|
if (obj.hasProperty(rt, "n_predict") && !obj.getProperty(rt, "n_predict").isUndefined()) {
|
|
137
169
|
options.n_predict = obj.getProperty(rt, "n_predict").asNumber();
|
|
138
170
|
} else if (obj.hasProperty(rt, "max_tokens") && !obj.getProperty(rt, "max_tokens").isUndefined()) {
|
|
@@ -365,13 +397,14 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
365
397
|
std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
|
|
366
398
|
|
|
367
399
|
// Clear the context KV cache
|
|
368
|
-
|
|
400
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
369
401
|
|
|
370
402
|
// Store original sampling parameters to restore later
|
|
371
403
|
float orig_temp = rn_ctx_->params.sampling.temp;
|
|
372
404
|
float orig_top_p = rn_ctx_->params.sampling.top_p;
|
|
373
405
|
float orig_top_k = rn_ctx_->params.sampling.top_k;
|
|
374
406
|
float orig_min_p = rn_ctx_->params.sampling.min_p;
|
|
407
|
+
float orig_presence_penalty = rn_ctx_->params.sampling.penalty_present;
|
|
375
408
|
int orig_n_predict = rn_ctx_->params.n_predict;
|
|
376
409
|
|
|
377
410
|
// Set sampling parameters from options
|
|
@@ -379,6 +412,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
379
412
|
rn_ctx_->params.sampling.top_p = options.top_p;
|
|
380
413
|
rn_ctx_->params.sampling.top_k = options.top_k;
|
|
381
414
|
rn_ctx_->params.sampling.min_p = options.min_p;
|
|
415
|
+
rn_ctx_->params.sampling.penalty_present = options.presence_penalty;
|
|
382
416
|
rn_ctx_->params.n_predict = options.n_predict;
|
|
383
417
|
|
|
384
418
|
// Check for a partial callback
|
|
@@ -426,6 +460,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
426
460
|
rn_ctx_->params.sampling.top_p = orig_top_p;
|
|
427
461
|
rn_ctx_->params.sampling.top_k = orig_top_k;
|
|
428
462
|
rn_ctx_->params.sampling.min_p = orig_min_p;
|
|
463
|
+
rn_ctx_->params.sampling.penalty_present = orig_presence_penalty;
|
|
429
464
|
rn_ctx_->params.n_predict = orig_n_predict;
|
|
430
465
|
|
|
431
466
|
return result;
|
|
@@ -885,29 +920,28 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
|
|
|
885
920
|
}
|
|
886
921
|
|
|
887
922
|
// Clear the context KV cache to ensure clean embedding
|
|
888
|
-
|
|
923
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
889
924
|
|
|
890
925
|
// Enable embedding mode
|
|
891
926
|
llama_set_embeddings(rn_ctx_->ctx, true);
|
|
892
927
|
|
|
893
|
-
//
|
|
928
|
+
// Create and populate batch using common_batch functions (following server.cpp pattern)
|
|
929
|
+
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
|
|
930
|
+
|
|
931
|
+
common_batch_clear(batch);
|
|
894
932
|
for (int i = 0; i < (int)tokens.size(); i++) {
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
/* token */ &token,
|
|
899
|
-
/* embd */ nullptr,
|
|
900
|
-
/* pos */ &i,
|
|
901
|
-
/* n_seq_id */ nullptr,
|
|
902
|
-
/* seq_id */ nullptr,
|
|
903
|
-
/* logits */ nullptr
|
|
904
|
-
};
|
|
905
|
-
|
|
906
|
-
if (llama_decode(rn_ctx_->ctx, batch) != 0) {
|
|
907
|
-
throw std::runtime_error("Failed to decode token for embedding");
|
|
908
|
-
}
|
|
933
|
+
// For embeddings, we typically need logits for the last token (for pooling)
|
|
934
|
+
bool needs_logits = (i == (int)tokens.size() - 1);
|
|
935
|
+
common_batch_add(batch, tokens[i], i, {0}, needs_logits);
|
|
909
936
|
}
|
|
910
937
|
|
|
938
|
+
if (llama_decode(rn_ctx_->ctx, batch) != 0) {
|
|
939
|
+
llama_batch_free(batch);
|
|
940
|
+
throw std::runtime_error("Failed to decode tokens for embedding");
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
llama_batch_free(batch);
|
|
944
|
+
|
|
911
945
|
// Get embedding size from the model
|
|
912
946
|
const int n_embd = llama_model_n_embd(rn_ctx_->model);
|
|
913
947
|
if (n_embd <= 0) {
|
package/cpp/build-info.cpp
CHANGED
|
@@ -95,7 +95,7 @@ endif()
|
|
|
95
95
|
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
|
96
96
|
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
|
97
97
|
endif()
|
|
98
|
-
set(LLAMA_INSTALL_VERSION 0.0.${
|
|
98
|
+
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
|
|
99
99
|
|
|
100
100
|
# override ggml options
|
|
101
101
|
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
|
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2706
2706
|
params.embd_sep = value;
|
|
2707
2707
|
}
|
|
2708
2708
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2709
|
+
add_opt(common_arg(
|
|
2710
|
+
{"--cls-separator"}, "STRING",
|
|
2711
|
+
"separator of classification sequences (default \\t) for example \"<#seq#>\"",
|
|
2712
|
+
[](common_params & params, const std::string & value) {
|
|
2713
|
+
params.cls_sep = value;
|
|
2714
|
+
}
|
|
2715
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2709
2716
|
add_opt(common_arg(
|
|
2710
2717
|
{"--host"}, "HOST",
|
|
2711
2718
|
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
|
|
@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
|
|
|
1290
1290
|
int n_tokens = text.length() + 2 * add_special;
|
|
1291
1291
|
std::vector<llama_token> result(n_tokens);
|
|
1292
1292
|
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
1293
|
+
if (n_tokens == std::numeric_limits<int32_t>::min()) {
|
|
1294
|
+
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
|
|
1295
|
+
}
|
|
1293
1296
|
if (n_tokens < 0) {
|
|
1294
1297
|
result.resize(-n_tokens);
|
|
1295
1298
|
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
@@ -358,6 +358,7 @@ struct common_params {
|
|
|
358
358
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
359
359
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
360
360
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
361
|
+
std::string cls_sep = "\t"; // separator of classification sequences
|
|
361
362
|
|
|
362
363
|
// server params
|
|
363
364
|
int32_t port = 8080; // server listens on this network port
|
|
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
|
|
41
41
|
return result;
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
|
|
45
|
-
class string_view {
|
|
46
|
-
const std::string & _str;
|
|
47
|
-
const size_t _start;
|
|
48
|
-
const size_t _end;
|
|
49
|
-
public:
|
|
50
|
-
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
|
|
51
|
-
|
|
52
|
-
size_t size() const {
|
|
53
|
-
return _end - _start;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
size_t length() const {
|
|
57
|
-
return size();
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
operator std::string() const {
|
|
61
|
-
return str();
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
std::string str() const {
|
|
65
|
-
return _str.substr(_start, _end - _start);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
string_view substr(size_t pos, size_t len = std::string::npos) const {
|
|
69
|
-
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
char operator[](size_t pos) const {
|
|
73
|
-
auto index = _start + pos;
|
|
74
|
-
if (index >= _end) {
|
|
75
|
-
throw std::out_of_range("string_view index out of range");
|
|
76
|
-
}
|
|
77
|
-
return _str[_start + pos];
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
bool operator==(const string_view & other) const {
|
|
81
|
-
std::string this_str = *this;
|
|
82
|
-
std::string other_str = other;
|
|
83
|
-
return this_str == other_str;
|
|
84
|
-
}
|
|
85
|
-
};
|
|
86
|
-
|
|
87
44
|
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
|
88
45
|
auto has_min = min_value != std::numeric_limits<int>::min();
|
|
89
46
|
auto has_max = max_value != std::numeric_limits<int>::max();
|
|
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
|
|
112
69
|
}
|
|
113
70
|
out << "}";
|
|
114
71
|
};
|
|
115
|
-
std::function<void(const string_view &, const string_view &)> uniform_range =
|
|
116
|
-
[&](const string_view & from, const string_view & to) {
|
|
72
|
+
std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
|
|
73
|
+
[&](const std::string_view & from, const std::string_view & to) {
|
|
117
74
|
size_t i = 0;
|
|
118
75
|
while (i < from.length() && i < to.length() && from[i] == to[i]) {
|
|
119
76
|
i++;
|
|
120
77
|
}
|
|
121
78
|
if (i > 0) {
|
|
122
|
-
out << "\"" << from.substr(0, i)
|
|
79
|
+
out << "\"" << from.substr(0, i) << "\"";
|
|
123
80
|
}
|
|
124
81
|
if (i < from.length() && i < to.length()) {
|
|
125
82
|
if (i > 0) {
|
|
@@ -310,6 +310,8 @@ class ModelBase:
|
|
|
310
310
|
gguf.MODEL_TENSOR.POSNET_NORM2,
|
|
311
311
|
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
|
|
312
312
|
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
|
|
313
|
+
gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
|
|
314
|
+
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
|
|
313
315
|
)
|
|
314
316
|
)
|
|
315
317
|
or not new_name.endswith(".weight")
|
|
@@ -320,7 +322,11 @@ class ModelBase:
|
|
|
320
322
|
self.match_model_tensor_name(new_name, key, bid)
|
|
321
323
|
for key in (
|
|
322
324
|
gguf.MODEL_TENSOR.TOKEN_EMBD,
|
|
325
|
+
gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
|
323
326
|
gguf.MODEL_TENSOR.OUTPUT,
|
|
327
|
+
gguf.MODEL_TENSOR.ALTUP_ROUTER,
|
|
328
|
+
gguf.MODEL_TENSOR.LAUREL_L,
|
|
329
|
+
gguf.MODEL_TENSOR.LAUREL_R,
|
|
324
330
|
)
|
|
325
331
|
):
|
|
326
332
|
if self.ftype in (
|
|
@@ -921,13 +927,20 @@ class TextModel(ModelBase):
|
|
|
921
927
|
tokenizer = SentencePieceProcessor()
|
|
922
928
|
tokenizer.LoadFromFile(str(tokenizer_path))
|
|
923
929
|
|
|
924
|
-
vocab_size = self.
|
|
930
|
+
vocab_size = self.find_hparam([
|
|
931
|
+
"vocab_size_per_layer_input", # gemma3n
|
|
932
|
+
"vocab_size",
|
|
933
|
+
], optional=True) or tokenizer.vocab_size()
|
|
925
934
|
|
|
926
935
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
|
927
936
|
scores: list[float] = [-10000.0] * vocab_size
|
|
928
937
|
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
|
929
938
|
|
|
930
939
|
for token_id in range(tokenizer.vocab_size()):
|
|
940
|
+
if token_id >= vocab_size:
|
|
941
|
+
logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
|
|
942
|
+
break
|
|
943
|
+
|
|
931
944
|
piece = tokenizer.IdToPiece(token_id)
|
|
932
945
|
text = piece.encode("utf-8")
|
|
933
946
|
score = tokenizer.GetScore(token_id)
|
|
@@ -2145,7 +2158,6 @@ class Llama4Model(LlamaModel):
|
|
|
2145
2158
|
|
|
2146
2159
|
def set_vocab(self):
|
|
2147
2160
|
self._set_vocab_gpt2()
|
|
2148
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
2149
2161
|
|
|
2150
2162
|
def set_gguf_parameters(self):
|
|
2151
2163
|
super().set_gguf_parameters()
|
|
@@ -2194,7 +2206,7 @@ class Llama4VisionModel(MmprojModel):
|
|
|
2194
2206
|
name += ".weight"
|
|
2195
2207
|
if "multi_modal_projector.linear_1" in name:
|
|
2196
2208
|
# despite the name with number postfix, this is a single fully connected layer
|
|
2197
|
-
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
|
|
2209
|
+
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
|
|
2198
2210
|
return [(self.map_tensor_name(name), data_torch)]
|
|
2199
2211
|
return []
|
|
2200
2212
|
|
|
@@ -3918,9 +3930,6 @@ class BertModel(TextModel):
|
|
|
3918
3930
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
3919
3931
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
3920
3932
|
|
|
3921
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
3922
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
3923
|
-
|
|
3924
3933
|
|
|
3925
3934
|
@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
|
|
3926
3935
|
class DistilBertModel(BertModel):
|
|
@@ -3962,8 +3971,6 @@ class RobertaModel(BertModel):
|
|
|
3962
3971
|
bpe_tok_path = self.dir_model / "tokenizer.json"
|
|
3963
3972
|
if bpe_tok_path.exists():
|
|
3964
3973
|
self._set_vocab_gpt2()
|
|
3965
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
3966
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
3967
3974
|
|
|
3968
3975
|
# we need this to validate the size of the token_type embeddings
|
|
3969
3976
|
# though currently we are passing all zeros to the token_type embeddings
|
|
@@ -4223,6 +4230,7 @@ class Gemma2Model(TextModel):
|
|
|
4223
4230
|
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
|
4224
4231
|
class Gemma3Model(TextModel):
|
|
4225
4232
|
model_arch = gguf.MODEL_ARCH.GEMMA3
|
|
4233
|
+
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
|
4226
4234
|
|
|
4227
4235
|
def set_vocab(self):
|
|
4228
4236
|
self._set_vocab_sentencepiece()
|
|
@@ -4244,9 +4252,8 @@ class Gemma3Model(TextModel):
|
|
|
4244
4252
|
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
|
4245
4253
|
self.gguf_writer.add_file_type(self.ftype)
|
|
4246
4254
|
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
|
4247
|
-
#
|
|
4255
|
+
# attn_logit_softcapping is removed in Gemma3
|
|
4248
4256
|
assert hparams.get("attn_logit_softcapping") is None
|
|
4249
|
-
assert hparams.get("final_logit_softcapping") is None
|
|
4250
4257
|
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
|
4251
4258
|
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
|
4252
4259
|
if hparams.get("rope_scaling") is not None:
|
|
@@ -4258,7 +4265,7 @@ class Gemma3Model(TextModel):
|
|
|
4258
4265
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4259
4266
|
del bid # unused
|
|
4260
4267
|
|
|
4261
|
-
if
|
|
4268
|
+
if "language_model." in name:
|
|
4262
4269
|
name = name.replace("language_model.", "")
|
|
4263
4270
|
|
|
4264
4271
|
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
|
@@ -4273,8 +4280,9 @@ class Gemma3Model(TextModel):
|
|
|
4273
4280
|
|
|
4274
4281
|
# ref code in Gemma3RMSNorm
|
|
4275
4282
|
# output = output * (1.0 + self.weight.float())
|
|
4283
|
+
# note: this is not the case on gemma3n
|
|
4276
4284
|
if name.endswith("norm.weight"):
|
|
4277
|
-
data_torch = data_torch +
|
|
4285
|
+
data_torch = data_torch + self.norm_shift
|
|
4278
4286
|
|
|
4279
4287
|
return [(self.map_tensor_name(name), data_torch)]
|
|
4280
4288
|
|
|
@@ -4331,6 +4339,104 @@ class Gemma3VisionModel(MmprojModel):
|
|
|
4331
4339
|
return [] # skip other tensors
|
|
4332
4340
|
|
|
4333
4341
|
|
|
4342
|
+
@ModelBase.register("Gemma3nForConditionalGeneration")
|
|
4343
|
+
class Gemma3NModel(Gemma3Model):
|
|
4344
|
+
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
|
4345
|
+
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
|
4346
|
+
|
|
4347
|
+
_altup_proj: list[Tensor] = []
|
|
4348
|
+
_altup_unembd: list[Tensor] = []
|
|
4349
|
+
|
|
4350
|
+
def __init__(self, *args, **kwargs):
|
|
4351
|
+
super().__init__(*args, **kwargs)
|
|
4352
|
+
assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
|
|
4353
|
+
self._altup_proj = [
|
|
4354
|
+
torch.Tensor(), # to be replaced
|
|
4355
|
+
torch.Tensor(), # to be replaced
|
|
4356
|
+
torch.Tensor(), # to be replaced
|
|
4357
|
+
]
|
|
4358
|
+
self._altup_unembd = [
|
|
4359
|
+
torch.Tensor(), # to be replaced
|
|
4360
|
+
torch.Tensor(), # to be replaced
|
|
4361
|
+
torch.Tensor(), # to be replaced
|
|
4362
|
+
]
|
|
4363
|
+
|
|
4364
|
+
def set_vocab(self):
|
|
4365
|
+
with open(self.dir_model / "chat_template.jinja") as f:
|
|
4366
|
+
# quick hack to make sure chat template is added
|
|
4367
|
+
self.gguf_writer.add_chat_template(f.read())
|
|
4368
|
+
super().set_vocab()
|
|
4369
|
+
|
|
4370
|
+
def set_gguf_parameters(self):
|
|
4371
|
+
super().set_gguf_parameters()
|
|
4372
|
+
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
|
|
4373
|
+
self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
|
|
4374
|
+
self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
|
|
4375
|
+
self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
|
|
4376
|
+
|
|
4377
|
+
activation_sparsity_scale = []
|
|
4378
|
+
for s in self.hparams["activation_sparsity_pattern"]:
|
|
4379
|
+
normal_dist = torch.distributions.normal.Normal(0, 1)
|
|
4380
|
+
std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
|
|
4381
|
+
activation_sparsity_scale.append(std_multiplier.item())
|
|
4382
|
+
self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
|
|
4383
|
+
|
|
4384
|
+
sliding_window_pattern = []
|
|
4385
|
+
for t in self.hparams["layer_types"]:
|
|
4386
|
+
sliding_window_pattern.append(t == "sliding_attention")
|
|
4387
|
+
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
|
4388
|
+
|
|
4389
|
+
def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
|
|
4390
|
+
has_all = all(m.numel() > 0 for m in matrices)
|
|
4391
|
+
if not has_all:
|
|
4392
|
+
return None
|
|
4393
|
+
else:
|
|
4394
|
+
return torch.stack(matrices, dim=0)
|
|
4395
|
+
|
|
4396
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4397
|
+
if name.endswith("_scale"):
|
|
4398
|
+
name = name + ".weight"
|
|
4399
|
+
|
|
4400
|
+
# TODO: implement self.prediction_coefs.weight.clamp_(...)
|
|
4401
|
+
|
|
4402
|
+
if "language_model." not in name:
|
|
4403
|
+
return [] # skip non-language model tensors
|
|
4404
|
+
|
|
4405
|
+
if "altup_unembed_projections" in name:
|
|
4406
|
+
data_torch = data_torch.to(device="cpu")
|
|
4407
|
+
if ".0." in name:
|
|
4408
|
+
self._altup_unembd[0] = data_torch
|
|
4409
|
+
elif ".1." in name:
|
|
4410
|
+
self._altup_unembd[1] = data_torch
|
|
4411
|
+
elif ".2." in name:
|
|
4412
|
+
self._altup_unembd[2] = data_torch
|
|
4413
|
+
else:
|
|
4414
|
+
raise ValueError(f"Unknown name: {name}")
|
|
4415
|
+
out = self._stack_matrices(self._altup_unembd)
|
|
4416
|
+
if out is not None:
|
|
4417
|
+
return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
|
|
4418
|
+
else:
|
|
4419
|
+
return []
|
|
4420
|
+
|
|
4421
|
+
if "altup_projections" in name:
|
|
4422
|
+
data_torch = data_torch.to(device="cpu")
|
|
4423
|
+
if ".0." in name:
|
|
4424
|
+
self._altup_proj[0] = data_torch
|
|
4425
|
+
elif ".1." in name:
|
|
4426
|
+
self._altup_proj[1] = data_torch
|
|
4427
|
+
elif ".2." in name:
|
|
4428
|
+
self._altup_proj[2] = data_torch
|
|
4429
|
+
else:
|
|
4430
|
+
raise ValueError(f"Unknown name: {name}")
|
|
4431
|
+
out = self._stack_matrices(self._altup_proj)
|
|
4432
|
+
if out is not None:
|
|
4433
|
+
return [(self.map_tensor_name("model.altup_projections.weight"), out)]
|
|
4434
|
+
else:
|
|
4435
|
+
return []
|
|
4436
|
+
|
|
4437
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
4438
|
+
|
|
4439
|
+
|
|
4334
4440
|
@ModelBase.register("Starcoder2ForCausalLM")
|
|
4335
4441
|
class StarCoder2Model(TextModel):
|
|
4336
4442
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
|
@@ -4848,8 +4954,6 @@ class JinaBertV2Model(BertModel):
|
|
|
4848
4954
|
self.gguf_writer.add_token_type_count(2)
|
|
4849
4955
|
else:
|
|
4850
4956
|
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
|
|
4851
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
4852
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
4853
4957
|
|
|
4854
4958
|
|
|
4855
4959
|
@ModelBase.register("OpenELMForCausalLM")
|
|
@@ -5451,9 +5555,6 @@ class T5Model(TextModel):
|
|
|
5451
5555
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
5452
5556
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
5453
5557
|
|
|
5454
|
-
self.gguf_writer.add_add_bos_token(False)
|
|
5455
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
5456
|
-
|
|
5457
5558
|
def set_gguf_parameters(self):
|
|
5458
5559
|
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
|
5459
5560
|
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
|
@@ -5591,9 +5692,6 @@ class T5EncoderModel(TextModel):
|
|
|
5591
5692
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
5592
5693
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
5593
5694
|
|
|
5594
|
-
self.gguf_writer.add_add_bos_token(False)
|
|
5595
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
5596
|
-
|
|
5597
5695
|
def set_gguf_parameters(self):
|
|
5598
5696
|
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
|
5599
5697
|
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
|
@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
|
|
|
131
131
|
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
|
132
132
|
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
|
133
133
|
option(GGML_VXE "ggml: enable vxe" ON)
|
|
134
|
+
option(GGML_NNPA "ggml: enable nnpa" ON)
|
|
134
135
|
|
|
135
136
|
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
|
136
137
|
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
|
@@ -101,6 +101,7 @@ extern "C" {
|
|
|
101
101
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
|
102
102
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
|
103
103
|
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
|
|
104
|
+
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
|
|
104
105
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
|
105
106
|
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
|
106
107
|
|
|
@@ -133,6 +134,7 @@ extern "C" {
|
|
|
133
134
|
|
|
134
135
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
135
136
|
|
|
137
|
+
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
|
136
138
|
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
|
137
139
|
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
|
138
140
|
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|