cui-llama.rn 1.6.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -7
- package/android/src/main/CMakeLists.txt +16 -11
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
- package/android/src/main/jni.cpp +20 -4
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/LICENSE +21 -0
- package/cpp/chat.cpp +1 -1
- package/cpp/common.cpp +17 -2
- package/cpp/common.h +7 -3
- package/cpp/ggml-alloc.c +4 -1
- package/cpp/ggml-cpp.h +1 -1
- package/cpp/ggml-cpu/amx/amx.cpp +221 -0
- package/cpp/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
- package/cpp/ggml-cpu/amx/mmq.h +10 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
- package/cpp/ggml-cpu/common.h +72 -0
- package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
- package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
- package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
- package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
- package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
- package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
- package/cpp/ggml-cpu.h +5 -0
- package/cpp/ggml-impl.h +16 -9
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +492 -47
- package/cpp/ggml.c +134 -244
- package/cpp/ggml.h +61 -94
- package/cpp/json-schema-to-grammar.cpp +3 -0
- package/cpp/llama-arch.cpp +46 -17
- package/cpp/llama-arch.h +9 -0
- package/cpp/llama-batch.cpp +5 -1
- package/cpp/llama-batch.h +2 -1
- package/cpp/llama-chat.cpp +31 -10
- package/cpp/llama-chat.h +3 -2
- package/cpp/llama-context.cpp +104 -489
- package/cpp/llama-context.h +14 -30
- package/cpp/llama-graph.cpp +69 -62
- package/cpp/llama-graph.h +21 -18
- package/cpp/llama-hparams.h +5 -0
- package/cpp/llama-kv-cache.cpp +1497 -391
- package/cpp/llama-kv-cache.h +272 -80
- package/cpp/llama-memory.h +11 -1
- package/cpp/llama-model.cpp +502 -176
- package/cpp/llama-model.h +13 -3
- package/cpp/llama-sampling.cpp +2 -1
- package/cpp/llama-vocab.cpp +8 -1
- package/cpp/llama.h +14 -11
- package/cpp/rn-llama.cpp +20 -172
- package/cpp/rn-llama.h +1 -5
- package/ios/CMakeLists.txt +13 -10
- package/ios/RNLlama.h +6 -0
- package/ios/RNLlama.mm +5 -0
- package/ios/RNLlamaContext.mm +26 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +4 -0
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +5 -0
- package/cpp/binary-ops.h +0 -16
- package/cpp/ops.h +0 -128
- package/cpp/simd-mappings.h +0 -888
- package/cpp/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
- /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
- /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
- /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
- /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
- /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
- /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
- /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
- /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
- /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
- /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
package/cpp/ggml.h
CHANGED
```diff
@@ -394,8 +394,8 @@ extern "C" {

     // precision
     enum lm_ggml_prec {
-        LM_GGML_PREC_DEFAULT,
-        LM_GGML_PREC_F32,
+        LM_GGML_PREC_DEFAULT =  0, // stored as lm_ggml_tensor.op_params, 0 by default
+        LM_GGML_PREC_F32     = 10,
     };

     // model file types
```
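Pinning the enum values makes the precision request stable once it is serialized into `lm_ggml_tensor.op_params`. A minimal usage sketch, assuming upstream's `ggml_mul_mat_set_prec` helper under this package's `lm_` prefix (`ctx`, `w`, `x` are placeholders):

```cpp
// Sketch only: request f32 accumulation for a single matmul node.
struct lm_ggml_tensor * cur = lm_ggml_mul_mat(ctx, w, x);
lm_ggml_mul_mat_set_prec(cur, LM_GGML_PREC_F32); // written into op_params as the value 10
```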
```diff
@@ -482,6 +482,7 @@ extern "C" {
         LM_GGML_OP_CONV_TRANSPOSE_1D,
         LM_GGML_OP_IM2COL,
         LM_GGML_OP_IM2COL_BACK,
+        LM_GGML_OP_CONV_2D_DW,
         LM_GGML_OP_CONV_TRANSPOSE_2D,
         LM_GGML_OP_POOL_1D,
         LM_GGML_OP_POOL_2D,
@@ -508,17 +509,12 @@ extern "C" {

         LM_GGML_OP_UNARY,

-        LM_GGML_OP_MAP_UNARY,
-        LM_GGML_OP_MAP_BINARY,
-
-        LM_GGML_OP_MAP_CUSTOM1_F32,
-        LM_GGML_OP_MAP_CUSTOM2_F32,
-        LM_GGML_OP_MAP_CUSTOM3_F32,
-
         LM_GGML_OP_MAP_CUSTOM1,
         LM_GGML_OP_MAP_CUSTOM2,
         LM_GGML_OP_MAP_CUSTOM3,

+        LM_GGML_OP_CUSTOM,
+
         LM_GGML_OP_CROSS_ENTROPY_LOSS,
         LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         LM_GGML_OP_OPT_STEP_ADAMW,
```
```diff
@@ -683,6 +679,9 @@ extern "C" {
     LM_GGML_API bool lm_ggml_is_contiguous_1(const struct lm_ggml_tensor * tensor); // contiguous for dims >= 1
     LM_GGML_API bool lm_ggml_is_contiguous_2(const struct lm_ggml_tensor * tensor); // contiguous for dims >= 2

+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    LM_GGML_API bool lm_ggml_is_contiguous_channels(const struct lm_ggml_tensor * tensor);
+
     LM_GGML_API bool lm_ggml_are_same_shape (const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1);
     LM_GGML_API bool lm_ggml_are_same_stride(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1);

```
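A channel-contiguous tensor in this sense is produced by permuting a channels-first tensor; a hedged sketch (the sizes and `ctx` are placeholders):

```cpp
// Sketch only: C x W x H x N in memory (channels fastest), viewed as W x H x C x N.
const int64_t C = 16, W = 32, H = 32, N = 1;
struct lm_ggml_tensor * t    = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, C, W, H, N);
struct lm_ggml_tensor * whcn = lm_ggml_permute(ctx, t, 2, 0, 1, 3); // dim0->dim2, dim1->dim0, dim2->dim1
// lm_ggml_is_contiguous_channels(whcn) should now return true.
```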
```diff
@@ -1666,7 +1665,7 @@ extern "C" {
             struct lm_ggml_tensor  * a,
             struct lm_ggml_tensor  * b);

-    // depthwise
+    // depthwise (via im2col and mul_mat)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,  // convolution kernel
@@ -1678,6 +1677,22 @@ extern "C" {
             int                      d0,  // dilation dimension 0
             int                      d1); // dilation dimension 1

+    // Depthwise 2D convolution
+    // may be faster than lm_ggml_conv_2d_dw, but not available in all backends
+    // a:   KW KH 1 C     convolution kernel
+    // b:   W  H  C N     input data
+    // res: W_out H_out C N
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw_direct(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor  * b,
+            int                      stride0,
+            int                      stride1,
+            int                      pad0,
+            int                      pad1,
+            int                      dilation0,
+            int                      dilation1);
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
```
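A sketch of the two depthwise paths side by side; shapes follow the header comments above, and all tensors and dimensions are placeholder assumptions:

```cpp
// kernel: KW x KH x 1 x C, input: W x H x C x N (see header comments above)
const int64_t KW = 3, KH = 3, C = 16, W = 32, H = 32, N = 1;
struct lm_ggml_tensor * k   = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, KW, KH, 1, C);
struct lm_ggml_tensor * inp = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, W, H, C, N);

// portable path: lowered to im2col + mul_mat, available on every backend
struct lm_ggml_tensor * y0 = lm_ggml_conv_2d_dw(ctx, k, inp, 1, 1, 0, 0, 1, 1);

// direct path: single fused op; per the header comment, possibly faster but
// not implemented by all backends yet
struct lm_ggml_tensor * y1 = lm_ggml_conv_2d_dw_direct(ctx, k, inp,
        /*stride*/ 1, 1, /*pad*/ 0, 0, /*dilation*/ 1, 1);
```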
```diff
@@ -1723,24 +1738,29 @@ extern "C" {
             float                 p0,
             float                 p1);

-
+    enum lm_ggml_scale_mode {
+        LM_GGML_SCALE_MODE_NEAREST  = 0,
+        LM_GGML_SCALE_MODE_BILINEAR = 1,
+    };
+
+    // interpolate
     // multiplies ne0 and ne1 by scale factor
-    // used in stable-diffusion
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                      scale_factor);
+            int                      scale_factor,
+            enum lm_ggml_scale_mode  mode);

-    //
-    //
-    // used in tortoise.cpp
+    // interpolate
+    // interpolate scale to specified dimensions
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
             int                      ne0,
             int                      ne1,
             int                      ne2,
-            int                      ne3);
+            int                      ne3,
+            enum lm_ggml_scale_mode  mode);

     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad(
```
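Existing nearest-neighbour call sites now pass the mode explicitly; a short sketch (`img` is a placeholder F32 tensor):

```cpp
// 2x upscale with the new bilinear filter
struct lm_ggml_tensor * up2 = lm_ggml_upscale(ctx, img, 2, LM_GGML_SCALE_MODE_BILINEAR);

// or target explicit output dimensions, keeping the old nearest behaviour
struct lm_ggml_tensor * up = lm_ggml_upscale_ext(ctx, img,
        (int) img->ne[0] * 2, (int) img->ne[1] * 2,
        (int) img->ne[2], (int) img->ne[3],
        LM_GGML_SCALE_MODE_NEAREST);
```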
```diff
@@ -1917,83 +1937,6 @@ extern "C" {

     // custom operators

-    typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*lm_ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*lm_ggml_custom1_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-    typedef void (*lm_ggml_custom2_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-    typedef void (*lm_ggml_custom3_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_unary_f32(
-            struct lm_ggml_context        * ctx,
-            struct lm_ggml_tensor         * a,
-                   lm_ggml_unary_op_f32_t   fun),
-        "use lm_ggml_map_custom1 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_unary_inplace_f32(
-            struct lm_ggml_context        * ctx,
-            struct lm_ggml_tensor         * a,
-                   lm_ggml_unary_op_f32_t   fun),
-        "use lm_ggml_map_custom1_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_binary_f32(
-            struct lm_ggml_context         * ctx,
-            struct lm_ggml_tensor          * a,
-            struct lm_ggml_tensor          * b,
-                   lm_ggml_binary_op_f32_t   fun),
-        "use lm_ggml_map_custom2 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_binary_inplace_f32(
-            struct lm_ggml_context         * ctx,
-            struct lm_ggml_tensor          * a,
-            struct lm_ggml_tensor          * b,
-                   lm_ggml_binary_op_f32_t   fun),
-        "use lm_ggml_map_custom2_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1_f32(
-            struct lm_ggml_context          * ctx,
-            struct lm_ggml_tensor           * a,
-                   lm_ggml_custom1_op_f32_t   fun),
-        "use lm_ggml_map_custom1 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1_inplace_f32(
-            struct lm_ggml_context          * ctx,
-            struct lm_ggml_tensor           * a,
-                   lm_ggml_custom1_op_f32_t   fun),
-        "use lm_ggml_map_custom1_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom2_f32(
-            struct lm_ggml_context          * ctx,
-            struct lm_ggml_tensor           * a,
-            struct lm_ggml_tensor           * b,
-                   lm_ggml_custom2_op_f32_t   fun),
-        "use lm_ggml_map_custom2 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom2_inplace_f32(
-            struct lm_ggml_context          * ctx,
-            struct lm_ggml_tensor           * a,
-            struct lm_ggml_tensor           * b,
-                   lm_ggml_custom2_op_f32_t   fun),
-        "use lm_ggml_map_custom2_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom3_f32(
-            struct lm_ggml_context          * ctx,
-            struct lm_ggml_tensor           * a,
-            struct lm_ggml_tensor           * b,
-            struct lm_ggml_tensor           * c,
-                   lm_ggml_custom3_op_f32_t   fun),
-        "use lm_ggml_map_custom3 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom3_inplace_f32(
-            struct lm_ggml_context          * ctx,
-            struct lm_ggml_tensor           * a,
-            struct lm_ggml_tensor           * b,
-            struct lm_ggml_tensor           * c,
-                   lm_ggml_custom3_op_f32_t   fun),
-        "use lm_ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
     typedef void (*lm_ggml_custom1_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, int ith, int nth, void * userdata);
     typedef void (*lm_ggml_custom2_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*lm_ggml_custom3_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, const struct lm_ggml_tensor * c, int ith, int nth, void * userdata);
```
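Code still using the removed `*_f32` wrappers migrates to the surviving `lm_ggml_map_custom*` family, whose callbacks receive a thread index and partition the work themselves. A hedged migration sketch (the `my_relu` kernel is illustrative, and touching `data` directly assumes a CPU-resident tensor):

```cpp
#include <math.h>

// replaces a former lm_ggml_map_unary_f32 callback
static void my_relu(struct lm_ggml_tensor * dst, const struct lm_ggml_tensor * a,
                    int ith, int nth, void * userdata) {
    const int64_t n = lm_ggml_nelements(dst);
    for (int64_t i = ith; i < n; i += nth) { // simple interleaved thread split
        ((float *) dst->data)[i] = fmaxf(0.0f, ((const float *) a->data)[i]);
    }
    (void) userdata;
}

// before (removed): lm_ggml_map_unary_f32(ctx, x, my_relu_f32);
struct lm_ggml_tensor * y = lm_ggml_map_custom1(ctx, x, my_relu, LM_GGML_N_TASKS_MAX, NULL);
```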
```diff
@@ -2049,6 +1992,30 @@ extern "C" {
             int                     n_tasks,
             void                  * userdata);

+    typedef void (*lm_ggml_custom_op_t)(struct lm_ggml_tensor * dst , int ith, int nth, void * userdata);
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_custom_4d(
+            struct lm_ggml_context * ctx,
+            enum lm_ggml_type        type,
+            int64_t                  ne0,
+            int64_t                  ne1,
+            int64_t                  ne2,
+            int64_t                  ne3,
+            struct lm_ggml_tensor ** args,
+            int                      n_args,
+            lm_ggml_custom_op_t      fun,
+            int                      n_tasks,
+            void                   * userdata);
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_custom_inplace(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor ** args,
+            int                      n_args,
+            lm_ggml_custom_op_t      fun,
+            int                      n_tasks,
+            void                   * userdata);
+
     // loss function

     LM_GGML_API struct lm_ggml_tensor * lm_ggml_cross_entropy_loss(
```
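Unlike `lm_ggml_map_custom1/2/3`, the new generic op takes an arbitrary argument list plus an explicit output type and shape. A sketch under the declarations above (`x0`, `x1` and the callback body are placeholders; upstream ggml stores `args` in `dst->src`):

```cpp
static void my_op(struct lm_ggml_tensor * dst, int ith, int nth, void * userdata) {
    // inputs are reachable through dst->src[0..n_args-1] in upstream ggml
    (void) dst; (void) ith; (void) nth; (void) userdata;
}

struct lm_ggml_tensor * args[2] = { x0, x1 };
struct lm_ggml_tensor * out = lm_ggml_custom_4d(ctx, LM_GGML_TYPE_F32,
        x0->ne[0], x0->ne[1], x0->ne[2], x0->ne[3],
        args, 2, my_op, LM_GGML_N_TASKS_MAX, NULL);
```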
package/cpp/json-schema-to-grammar.cpp
CHANGED
```diff
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();

+    if (max_items == 0) {
+        return "";
+    }
     if (min_items == 0 && max_items == 1) {
         return item_rule + "?";
     }
```
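The early return covers schemas that pin a repetition to zero occurrences (e.g. `maxItems: 0`), which previously fell through to the general repetition builder. Expected results, sketched from the branches shown above:

```cpp
// build_repetition is file-local; results follow the code above
build_repetition("item", 0, 0); // ""       (new early return)
build_repetition("item", 0, 1); // "item?"  (unchanged path)
```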
package/cpp/llama-arch.cpp
CHANGED
```diff
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,           "refact"           },
     { LLM_ARCH_BERT,             "bert"             },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
+    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
     { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
     { LLM_ARCH_BLOOM,            "bloom"            },
     { LLM_ARCH_STABLELM,         "stablelm"         },
@@ -54,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
+    { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_BITNET,           "bitnet"           },
     { LLM_ARCH_T5,               "t5"               },
     { LLM_ARCH_T5ENCODER,        "t5encoder"        },
@@ -105,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,         "%s.expert_weights_scale"     },
     { LLM_KV_EXPERT_WEIGHTS_NORM,          "%s.expert_weights_norm"      },
     { LLM_KV_EXPERT_GATING_FUNC,           "%s.expert_gating_func"       },
+    { LLM_KV_MOE_EVERY_N_LAYERS,           "%s.moe_every_n_layers"       },
     { LLM_KV_POOLING_TYPE,                 "%s.pooling_type"             },
     { LLM_KV_LOGIT_SCALE,                  "%s.logit_scale"              },
     { LLM_KV_DECODER_START_TOKEN_ID,       "%s.decoder_start_token_id"   },
@@ -139,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },

     { LLM_KV_ROPE_DIMENSION_COUNT,         "%s.rope.dimension_count"      },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,      "%s.rope.dimension_sections"   },
@@ -469,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -1102,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_Q_B,        "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA,   "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B,       "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B,        "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B,        "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
@@ -1152,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1543,23 +1587,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,  {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {
-    {
-    {LLM_TENSOR_ATTN_Q,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP,   {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA,  {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
```
package/cpp/llama-arch.h
CHANGED
```diff
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -58,6 +59,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -109,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -143,6 +146,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -256,6 +261,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -303,6 +310,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
```
package/cpp/llama-batch.cpp
CHANGED
```diff
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     LM_GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
```
package/cpp/llama-batch.h
CHANGED
```diff
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
```
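Call sites move from two-phase initialisation to direct construction; a sketch of the internal usage implied by the new declarations (`batch` and `n_embd` are placeholders):

```cpp
// before (removed): llama_sbatch sbatch; sbatch.from_batch(batch, n_embd, true);
llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);
llama_sbatch empty; // the added default constructor keeps plain-member usage valid
```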
package/cpp/llama-chat.cpp
CHANGED
```diff
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R  },
     { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3    },
-    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3  },
-    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGML_4  },
+    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGLM_3  },
+    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGLM_4  },
     { "glmedge",           LLM_CHAT_TEMPLATE_GLMEDGE    },
     { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM    },
     { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3   },
@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "yandex",            LLM_CHAT_TEMPLATE_YANDEX     },
     { "bailing",           LLM_CHAT_TEMPLATE_BAILING    },
     { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4     },
+    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM    },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -432,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -442,14 +447,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
         if (add_ass) {
-            ss << "<|assistant|>";
+            ss << "<|assistant|>\n";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
@@ -620,7 +625,23 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|header_start|>assistant<|header_end|>\n\n";
         }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
+    } else {
         // template not supported
         return -1;
     }
```
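Rendering the new template produces a non-chatml prompt despite the `<|im_start|>` prefix. A hedged sketch, assuming the model-less `llama_chat_apply_template` overload present in this version's `llama.h`:

```cpp
llama_chat_message msgs[] = {
    { "system", "You are a helpful assistant." },
    { "user",   "Describe the image."          },
};
char buf[512];
int32_t n = llama_chat_apply_template("smolvlm", msgs, 2, /*add_ass=*/true, buf, sizeof(buf));
// buf now holds (per the branch above):
//   <|im_start|>You are a helpful assistant.\n\n
//   User: Describe the image.<end_of_utterance>\n
//   Assistant:
```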
package/cpp/llama-chat.h
CHANGED
```diff
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };

```