cactus-react-native 0.0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +20 -0
- package/README.md +3 -1
- package/android/src/main/CMakeLists.txt +58 -23
- package/android/src/main/java/com/cactus/Cactus.java +484 -16
- package/android/src/main/java/com/cactus/LlamaContext.java +199 -0
- package/android/src/main/jni.cpp +325 -10
- package/android/src/main/jniLibs/arm64-v8a/libcactus.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libcactus_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/libcactus.so +0 -0
- package/android/src/main/jniLibs/x86_64/libcactus_x86_64.so +0 -0
- package/android/src/newarch/java/com/cactus/CactusModule.java +79 -7
- package/android/src/oldarch/java/com/cactus/CactusModule.java +70 -0
- package/cactus-react-native.podspec +0 -3
- package/ios/CMakeLists.txt +58 -36
- package/ios/Cactus.mm +243 -2
- package/ios/CactusContext.h +22 -0
- package/ios/CactusContext.mm +176 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/ggml-llama.metallib +0 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/ggml-llama-sim.metallib +0 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/ggml-llama.metallib +0 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/cactus.h +92 -5
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/cactus_ffi.h +268 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/chat.h +2 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/common.h +42 -51
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-backend.h +4 -4
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-common.h +12 -6
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpp.h +1 -1
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu.h +5 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-impl.h +52 -18
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-metal-impl.h +106 -14
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-opt.h +49 -28
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml.h +87 -106
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-arch.h +16 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-batch.h +2 -1
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-chat.h +7 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-context.h +44 -33
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-cparams.h +1 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-graph.h +83 -17
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-hparams.h +44 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-kv-cache.h +407 -179
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-memory.h +13 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-loader.h +5 -3
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model-saver.h +37 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-model.h +24 -2
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama-vocab.h +6 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/llama.h +102 -142
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/chat-template.hpp +23 -11
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/minja/minja.hpp +186 -127
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/ggml-llama-sim.metallib +0 -0
- package/lib/commonjs/NativeCactus.js +1 -0
- package/lib/commonjs/NativeCactus.js.map +1 -1
- package/lib/commonjs/index.js +112 -0
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/tools.js +118 -0
- package/lib/commonjs/tools.js.map +1 -0
- package/lib/module/NativeCactus.js +3 -0
- package/lib/module/NativeCactus.js.map +1 -1
- package/lib/module/index.js +87 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/tools.js +110 -0
- package/lib/module/tools.js.map +1 -0
- package/lib/typescript/NativeCactus.d.ts +30 -1
- package/lib/typescript/NativeCactus.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +21 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/lib/typescript/tools.d.ts +38 -0
- package/lib/typescript/tools.d.ts.map +1 -0
- package/package.json +6 -3
- package/src/NativeCactus.ts +62 -1
- package/src/index.ts +113 -2
- package/src/tools.ts +127 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/sgemm.h +0 -14
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/ios-arm64_x86_64-simulator/cactus.framework/Headers/sgemm.h +0 -14
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/tvos-arm64/cactus.framework/Headers/sgemm.h +0 -14
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-impl.h +0 -531
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/cactus.xcframework/tvos-arm64_x86_64-simulator/cactus.framework/Headers/sgemm.h +0 -14
Headers/common.h (the same hunks appear in each of the four xcframework slices):

```diff
@@ -6,6 +6,7 @@
 
 #include <set>
 #include <string>
+#include <string_view>
 #include <vector>
 #include <sstream>
 
@@ -42,17 +43,6 @@ extern const char * LLAMA_BUILD_TARGET;
 
 struct common_control_vector_load_info;
 
-#define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
-} while(0)
-
-// build info
-extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
-
 //
 // CPU utils
 //
@@ -77,7 +67,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -87,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@@ -107,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
     COMMON_SAMPLER_TYPE_PENALTIES = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -132,10 +122,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -176,6 +162,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
@@ -195,6 +182,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = ""; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
 
```
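The new `common_params_model` struct consolidates the loose `model` / `model_url` / `hf_repo` / `hf_file` strings that the following hunks strip out of `common_params`, `common_params_speculative`, and `common_params_vocoder`. A hedged sketch of filling it in under the new layout (the repo and file names are illustrative placeholders, not from this package):

```cpp
// Sketch only: field names come from the hunk above; values are hypothetical.
common_params params;
params.model.hf_repo = "ggml-org/models";             // fetch from Hugging Face...
params.model.hf_file = "example-q4_k_m.gguf";
// ...or point at a local file instead:
params.speculative.model.path = "/models/draft.gguf"; // the draft model now nests under .model
```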
```diff
@@ -208,19 +202,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -279,12 +265,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -298,6 +282,7 @@ struct common_params {
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -339,17 +324,17 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device
 
     bool single_turn = false; // single turn chat conversation
 
@@ -361,8 +346,10 @@ struct common_params {
 
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
-    // multimodal models (see
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    // multimodal models (see tools/mtmd)
+    struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -385,6 +372,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
 
@@ -428,13 +416,14 @@ struct common_params {
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "
-    std::string cvector_negative_file = "
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
@@ -443,6 +432,11 @@ struct common_params {
 
     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };
 
 // call once at the start of a program if it uses libcommon
```
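The `load_progress_callback` pair added at the end of `common_params` is what lets a host app (here, the React Native bridge) report model-load progress and cancel a load in flight. A minimal sketch of wiring it up, assuming `llama_progress_callback`'s usual `(float progress, void * user_data)` shape:

```cpp
#include <atomic>
#include <cstdio>

// Sketch only: called periodically during model loading with progress in [0, 1];
// returning false aborts the load, per the comment in the hunk above.
static bool on_load_progress(float progress, void * user_data) {
    auto * cancelled = static_cast<std::atomic<bool> *>(user_data);
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    return !cancelled->load();
}

std::atomic<bool> cancelled{false};
common_params params;
params.load_progress_callback           = on_load_progress;
params.load_progress_callback_user_data = &cancelled;
```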
```diff
@@ -520,10 +514,9 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
-// While we wait for C++20's std::string::ends_with...
-static bool string_ends_with(const std::string & str, const std::string & suffix) {
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
+// While we wait for C++20's std::string::ends_with...
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
```
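`string_ends_with` moves out of line and switches to `std::string_view`, and it gains a sibling, `string_find_partial_stop`. The latter supports streamed generation: before emitting a decoded chunk, check whether the chunk's tail could be the beginning of a stop sequence and hold it back until the match resolves. A hedged sketch of one way to implement it (an assumed implementation, not necessarily the one shipped in common.cpp):

```cpp
#include <algorithm>
#include <cstddef>
#include <string_view>

// Returns the index in `str` where a prefix of `stop` begins at the very end
// of `str`, or npos if the tail of `str` cannot start the stop sequence.
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
    const size_t max_len = std::min(str.size(), stop.size());
    for (size_t len = max_len; len > 0; --len) { // prefer the longest candidate
        if (str.substr(str.size() - len) == stop.substr(0, len)) {
            return str.size() - len;
        }
    }
    return std::string_view::npos;
}
```

A streamer would emit everything before the returned index and buffer the rest until the stop sequence either completes or the partial match breaks.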
```diff
@@ -564,6 +557,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+std::string get_model_endpoint();
+
 //
 // Batch utils
 //
@@ -630,16 +625,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@@ -681,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
+
+//
+// training utils
+//
+
+lm_ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
```
Headers/ggml-backend.h:

```diff
@@ -38,7 +38,7 @@ extern "C" {
     LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer (lm_ggml_backend_buffer_type_t buft, size_t size);
     LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft);
     LM_GGML_API size_t lm_ggml_backend_buft_get_max_size (lm_ggml_backend_buffer_type_t buft);
-    LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor);
+    LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor);
     LM_GGML_API bool lm_ggml_backend_buft_is_host (lm_ggml_backend_buffer_type_t buft);
     LM_GGML_API lm_ggml_backend_dev_t lm_ggml_backend_buft_get_device (lm_ggml_backend_buffer_type_t buft);
 
@@ -59,7 +59,7 @@ extern "C" {
     LM_GGML_API enum lm_ggml_status lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor);
     LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer);
     LM_GGML_API size_t lm_ggml_backend_buffer_get_max_size (lm_ggml_backend_buffer_t buffer);
-    LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor);
+    LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor);
     LM_GGML_API void lm_ggml_backend_buffer_clear (lm_ggml_backend_buffer_t buffer, uint8_t value);
     LM_GGML_API bool lm_ggml_backend_buffer_is_host (lm_ggml_backend_buffer_t buffer);
     LM_GGML_API void lm_ggml_backend_buffer_set_usage (lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage);
@@ -248,7 +248,7 @@ extern "C" {
         // preferrably to run on the same backend as the buffer
         lm_ggml_backend_buffer_set_usage(buf_weights, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
-        sched = lm_ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, LM_GGML_DEFAULT_GRAPH_SIZE, false);
+        sched = lm_ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, LM_GGML_DEFAULT_GRAPH_SIZE, false, true);
 
         // initialize buffers from a max size graph (optional)
         reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
     typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);
 
     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
     LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
```
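`lm_ggml_backend_sched_new` grows a sixth parameter, `op_offload`, the scheduler-level switch behind the new `no_op_offload` common param above: it decides whether host tensor operations may be offloaded to a device. A hedged sketch of a call under the new signature (the backend handles are assumed to exist):

```cpp
// Sketch only: backend_metal / backend_cpu are assumed to have been created already.
lm_ggml_backend_t backends[2] = { backend_metal, backend_cpu };

lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(
    backends, NULL, /*n_backends =*/ 2, LM_GGML_DEFAULT_GRAPH_SIZE,
    /*parallel   =*/ false,
    /*op_offload =*/ true); // pass false to mirror common_params.no_op_offload

// ... reserve and compute graphs as before ...
lm_ggml_backend_sched_free(sched);
```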
Headers/ggml-common.h:

```diff
@@ -158,6 +158,12 @@ typedef sycl::half2 lm_ggml_half2;
 
 #endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define LM_GGML_EXTENSION
+#else // _MSC_VER
+#define LM_GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     lm_ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(lm_ggml_half) + QK4_0 / 2, "wrong q4_
 
 #define QK4_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(lm_ggml_half) + sizeof(uint32_t) + QK
 
 #define QK5_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(lm_ggml_half) + QK8_0, "wrong q8_0 bl
 
 #define QK8_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(lm_ggml_half) + QK_K / 4, "wrong tq2
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 +
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
```
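Every `union {` → `LM_GGML_EXTENSION union {` change in this header is the same fix: the quant blocks use an anonymous struct nested in a union, which `-Wpedantic` flags as a GNU extension, and `__extension__` (to which `LM_GGML_EXTENSION` expands on non-MSVC compilers) suppresses that warning. A minimal reproduction of the pattern with hypothetical names:

```cpp
// With the __extension__ prefix, -Wpedantic no longer warns about the
// anonymous struct member inside the union.
#ifdef _MSC_VER
#define MY_EXTENSION
#else
#define MY_EXTENSION __extension__
#endif

typedef struct {
    MY_EXTENSION union {
        struct {
            unsigned short d; // delta
            unsigned short m; // min
        };
        unsigned int dm;      // both halves viewed as one 32-bit value
    };
} demo_block;
```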
Headers/ggml-cpp.h:

```diff
@@ -24,7 +24,7 @@ typedef std::unique_ptr<lm_gguf_context, lm_gguf_context_deleter> lm_gguf_contex
 
 struct lm_ggml_gallocr_deleter { void operator()(lm_ggml_gallocr_t galloc) { lm_ggml_gallocr_free(galloc); } };
 
-typedef std::unique_ptr<
+typedef std::unique_ptr<lm_ggml_gallocr, lm_ggml_gallocr_deleter> lm_ggml_gallocr_ptr;
 
 // ggml-backend
 
```
Headers/ggml-cpu.h:

```diff
@@ -133,6 +133,11 @@ extern "C" {
 
     LM_GGML_BACKEND_API lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void);
 
+    LM_GGML_BACKEND_API void lm_ggml_cpu_fp32_to_fp16(const float *, lm_ggml_fp16_t *, int64_t);
+    LM_GGML_BACKEND_API void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t *, float *, int64_t);
+    LM_GGML_BACKEND_API void lm_ggml_cpu_fp32_to_bf16(const float *, lm_ggml_bf16_t *, int64_t);
+    LM_GGML_BACKEND_API void lm_ggml_cpu_bf16_to_fp32(const lm_ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
```
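The four converters exported here all share the same shape: source pointer, destination pointer, element count. A short usage sketch built only on the signatures above (buffer contents are illustrative):

```cpp
// Round-trip a few floats through the CPU half-precision converters.
float src[8] = {0.0f, 0.5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, 32.0f};
lm_ggml_fp16_t half[8];
float back[8];

lm_ggml_cpu_fp32_to_fp16(src, half, 8);  // fp32 -> fp16, 8 elements
lm_ggml_cpu_fp16_to_fp32(half, back, 8); // fp16 -> fp32; these values round-trip exactly
```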
Headers/ggml-impl.h (removed lines shown bare `-` lost their text in this extract):

```diff
@@ -148,8 +148,14 @@ struct lm_ggml_map_custom2_op_params {
 
 struct lm_ggml_map_custom3_op_params {
     lm_ggml_custom3_op_t fun;
-    int
-    void
+    int n_tasks;
+    void * userdata;
+};
+
+struct lm_ggml_custom_op_params {
+    lm_ggml_custom_op_t fun;
+    int n_tasks;
+    void * userdata;
 };
 
 // bitset
@@ -311,29 +317,28 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
 
 // FP16 to FP32 conversion
 
-
-
-
-
-
-
-
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+//
+// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
+// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
+//
+#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
 #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
 #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
 
 #define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
 
 static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
-
+    __fp16 tmp;
     memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
     return (float)tmp;
 }
 
 static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
     lm_ggml_fp16_t res;
-
+    __fp16 tmp = f;
     memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
     return res;
 }
@@ -357,8 +362,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
 #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
 
 static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
-
-
+    float f;
+    double d;
     __asm__(
         "mtfprd %0,%2\n"
         "xscvhpdp %0,%0\n"
@@ -370,8 +375,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
 }
 
 static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
-
-
+    double d;
+    lm_ggml_fp16_t r;
     __asm__( /* xscvdphp can work on double or single precision */
         "xscvdphp %0,%2\n"
         "mffprd %1,%0\n" :
@@ -381,6 +386,35 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
     return r;
 }
 
+#elif defined(__riscv) && defined(LM_GGML_RV_ZFH)
+
+static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+    float f;
+    __asm__(
+        "fmv.h.x %[f], %[h]\n\t"
+        "fcvt.s.h %[f], %[f]"
+        : [f] "=&f" (f)
+        : [h] "r" (h)
+    );
+    return f;
+}
+
+static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
+    lm_ggml_fp16_t res;
+    __asm__(
+        "fcvt.h.s %[f], %[f]\n\t"
+        "fmv.x.h %[h], %[f]"
+        : [h] "=&r" (res)
+        : [f] "f" (f)
+    );
+    return res;
+}
+
+#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
+#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
+#define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
+#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
+
 #else
 
 // FP16 <-> FP32
@@ -456,7 +490,7 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
 #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
 #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
 
-#endif // defined(__ARM_NEON) && (!defined(
+#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
 
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in lm_ggml_init()
```
Headers/ggml-metal-impl.h:

```diff
@@ -1,6 +1,70 @@
 #ifndef LM_GGML_METAL_IMPL
 #define LM_GGML_METAL_IMPL
 
+// kernel parameters for mat-vec threadgroups
+//
+// N_R0: number of src0 rows to process per simdgroup
+// N_SG: number of simdgroups per threadgroup
+//
+// TODO: for optimal performance, become function of the device and work size
+
+#define N_R0_Q4_0 4
+#define N_SG_Q4_0 2
+
+#define N_R0_Q4_1 4
+#define N_SG_Q4_1 2
+
+#define N_R0_Q5_0 4
+#define N_SG_Q5_0 2
+
+#define N_R0_Q5_1 4
+#define N_SG_Q5_1 2
+
+#define N_R0_Q8_0 4
+#define N_SG_Q8_0 2
+
+#define N_R0_Q2_K 4
+#define N_SG_Q2_K 2
+
+#define N_R0_Q3_K 2
+#define N_SG_Q3_K 2
+
+#define N_R0_Q4_K 4
+#define N_SG_Q4_K 2
+
+#define N_R0_Q5_K 2
+#define N_SG_Q5_K 2
+
+#define N_R0_Q6_K 1
+#define N_SG_Q6_K 2
+
+#define N_R0_IQ1_S 4
+#define N_SG_IQ1_S 2
+
+#define N_R0_IQ1_M 4
+#define N_SG_IQ1_M 2
+
+#define N_R0_IQ2_XXS 4
+#define N_SG_IQ2_XXS 2
+
+#define N_R0_IQ2_XS 4
+#define N_SG_IQ2_XS 2
+
+#define N_R0_IQ2_S 4
+#define N_SG_IQ2_S 2
+
+#define N_R0_IQ3_XXS 4
+#define N_SG_IQ3_XXS 2
+
+#define N_R0_IQ3_S 4
+#define N_SG_IQ3_S 2
+
+#define N_R0_IQ4_NL 2
+#define N_SG_IQ4_NL 2
+
+#define N_R0_IQ4_XS 2
+#define N_SG_IQ4_XS 2
+
 // kernel argument structs
 //
 // - element counters (e.g. ne00) typically use int32_t to reduce register usage
```
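Taken together, N_R0 × N_SG gives the number of src0 rows a mat-vec threadgroup covers: Q4_0, for example, processes 4 rows per simdgroup across 2 simdgroups, i.e. 8 rows per threadgroup, while Q6_K (1 × 2) keeps only 2 rows in flight.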
```diff
@@ -143,6 +207,10 @@ typedef struct {
     float attn_factor;
     float beta_fast;
     float beta_slow;
+    int32_t sect_0;
+    int32_t sect_1;
+    int32_t sect_2;
+    int32_t sect_3;
 } lm_ggml_metal_kargs_rope;
 
 typedef struct {
@@ -155,9 +223,12 @@ typedef struct {
     int32_t ne11;
     int32_t ne_12_2; // assume K and V are same shape
     int32_t ne_12_3;
-    uint64_t
-    uint64_t
-    uint64_t
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    uint64_t nb21;
+    uint64_t nb22;
+    uint64_t nb23;
     uint64_t nb31;
     int32_t ne1;
     int32_t ne2;
@@ -232,21 +303,42 @@ typedef struct {
 } lm_ggml_metal_kargs_mul_mv_ext;
 
 typedef struct {
-    int32_t
-    int32_t
-    uint64_t
+    int32_t ne10;
+    int32_t ne11; // n_expert_used (bcast)
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t neh11; // n_tokens
+    uint64_t nbh11;
+    int32_t ne20; // n_expert_used
+    uint64_t nb21;
+} lm_ggml_metal_kargs_mul_mm_id_map0;
+
+typedef struct {
+    int32_t ne20; // n_expert_used
+    int32_t neh0;
+    int32_t neh1;
+    uint64_t nbh1;
+    uint64_t nbh2;
+    int32_t ne0;
+    uint64_t nb1;
+    uint64_t nb2;
+} lm_ggml_metal_kargs_mul_mm_id_map1;
+
+typedef struct {
     int32_t ne00;
     int32_t ne02;
     uint64_t nb01;
     uint64_t nb02;
-
-    int32_t
-
-    uint64_t
-    uint64_t
-    uint64_t
-    int32_t
-    int32_t
+    uint64_t nb03;
+    int32_t neh12;
+    uint64_t nbh10;
+    uint64_t nbh11;
+    uint64_t nbh12;
+    uint64_t nbh13;
+    int32_t neh0;
+    int32_t neh1;
+    int16_t r2;
+    int16_t r3;
 } lm_ggml_metal_kargs_mul_mm_id;
 
 typedef struct {
```