@novastera-oss/llamarn 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/proguard-rules.pro +12 -0
- package/android/src/main/cpp/include/llama.h +15 -47
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakePresets.json +11 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -0
- package/cpp/llama.cpp/README.md +4 -3
- package/cpp/llama.cpp/common/arg.cpp +45 -1
- package/cpp/llama.cpp/common/common.cpp +22 -6
- package/cpp/llama.cpp/common/common.h +18 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
- package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
- package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
- package/cpp/llama.cpp/include/llama.h +15 -7
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
- package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
- package/cpp/llama.cpp/src/llama-arch.h +5 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
- package/cpp/llama.cpp/src/llama-batch.h +24 -18
- package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
- package/cpp/llama.cpp/src/llama-chat.h +2 -0
- package/cpp/llama.cpp/src/llama-context.cpp +180 -106
- package/cpp/llama.cpp/src/llama-context.h +26 -16
- package/cpp/llama.cpp/src/llama-cparams.h +3 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
- package/cpp/llama.cpp/src/llama-graph.h +147 -72
- package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
- package/cpp/llama.cpp/src/llama-hparams.h +10 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
- package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
- package/cpp/llama.cpp/src/llama-model.h +3 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
- package/cpp/llama.cpp/src/llama-vocab.h +2 -0
- package/cpp/llama.cpp/src/unicode.cpp +207 -0
- package/cpp/llama.cpp/src/unicode.h +2 -0
- package/ios/include/common.h +18 -4
- package/ios/include/llama.h +15 -7
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +4 -4
|
@@ -7,7 +7,6 @@ import pathlib
|
|
|
7
7
|
import re
|
|
8
8
|
|
|
9
9
|
import requests
|
|
10
|
-
import sys
|
|
11
10
|
import json
|
|
12
11
|
import shutil
|
|
13
12
|
import argparse
|
|
@@ -69,8 +68,7 @@ args = parser.parse_args()
|
|
|
69
68
|
hf_token = args.hf_token if args.hf_token is not None else hf_token
|
|
70
69
|
|
|
71
70
|
if hf_token is None:
|
|
72
|
-
logger.
|
|
73
|
-
sys.exit(1)
|
|
71
|
+
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
|
74
72
|
|
|
75
73
|
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
|
76
74
|
# will be updated with time - contributions welcome
|
|
@@ -131,6 +129,7 @@ models = [
|
|
|
131
129
|
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
|
|
132
130
|
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
|
|
133
131
|
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
|
132
|
+
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
|
134
133
|
]
|
|
135
134
|
|
|
136
135
|
# some models are known to be broken upstream, so we will skip them as exceptions
|
|
@@ -146,11 +145,12 @@ pre_computed_hashes = [
|
|
|
146
145
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
|
|
147
146
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
|
|
148
147
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
|
148
|
+
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
|
|
149
149
|
]
|
|
150
150
|
|
|
151
151
|
|
|
152
152
|
def download_file_with_auth(url, token, save_path):
|
|
153
|
-
headers = {"Authorization": f"Bearer {token}"}
|
|
153
|
+
headers = {"Authorization": f"Bearer {token}"} if token else None
|
|
154
154
|
response = sess.get(url, headers=headers)
|
|
155
155
|
response.raise_for_status()
|
|
156
156
|
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
|
@@ -231,7 +231,7 @@ for model in models:
|
|
|
231
231
|
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
|
232
232
|
|
|
233
233
|
src_ifs = ""
|
|
234
|
-
for model in [*all_models, *pre_computed_hashes]:
|
|
234
|
+
for model in [*pre_computed_hashes, *all_models]:
|
|
235
235
|
name = model["name"]
|
|
236
236
|
tokt = model["tokt"]
|
|
237
237
|
chkhsh = model.get("chkhsh")
|
|
@@ -239,11 +239,6 @@ for model in [*all_models, *pre_computed_hashes]:
|
|
|
239
239
|
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
|
|
240
240
|
continue
|
|
241
241
|
|
|
242
|
-
# Skip if the tokenizer folder does not exist or there are other download issues previously
|
|
243
|
-
if not os.path.exists(f"models/tokenizers/{name}"):
|
|
244
|
-
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
|
|
245
|
-
continue
|
|
246
|
-
|
|
247
242
|
# create the tokenizer
|
|
248
243
|
if chkhsh is not None:
|
|
249
244
|
# if the model has a pre-computed hash, use it
|
|
@@ -253,15 +248,19 @@ for model in [*all_models, *pre_computed_hashes]:
|
|
|
253
248
|
chkhsh = existing_models[name]
|
|
254
249
|
else:
|
|
255
250
|
# otherwise, compute the hash of the tokenizer
|
|
251
|
+
|
|
252
|
+
# Fail if the tokenizer folder with config does not exist or there are other download issues previously
|
|
253
|
+
if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
|
|
254
|
+
raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
|
|
255
|
+
|
|
256
256
|
try:
|
|
257
257
|
logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
|
|
258
258
|
if name == "t5":
|
|
259
259
|
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
|
|
260
260
|
else:
|
|
261
261
|
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
|
262
|
-
except
|
|
263
|
-
|
|
264
|
-
continue # Skip to the next model if the tokenizer can't be loaded
|
|
262
|
+
except Exception as e:
|
|
263
|
+
raise OSError(f"Error loading tokenizer for model {name}.") from e
|
|
265
264
|
|
|
266
265
|
chktok = tokenizer.encode(CHK_TXT)
|
|
267
266
|
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
|
@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
|
|
|
131
131
|
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
|
132
132
|
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
|
133
133
|
option(GGML_VXE "ggml: enable vxe" ON)
|
|
134
|
-
option(GGML_NNPA "ggml: enable nnpa" ON)
|
|
134
|
+
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
|
135
135
|
|
|
136
136
|
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
|
137
137
|
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
|
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
|
|
|
174
174
|
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
|
175
175
|
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
|
176
176
|
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
|
177
|
+
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
|
178
|
+
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
|
177
179
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
|
178
180
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
|
179
181
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
|
@@ -181,6 +183,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
|
|
|
181
183
|
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
|
|
182
184
|
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
|
183
185
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
|
186
|
+
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
|
|
187
|
+
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
|
184
188
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
|
185
189
|
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
|
186
190
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
|
@@ -270,6 +274,7 @@ set(GGML_PUBLIC_HEADERS
|
|
|
270
274
|
include/ggml-rpc.h
|
|
271
275
|
include/ggml-sycl.h
|
|
272
276
|
include/ggml-vulkan.h
|
|
277
|
+
include/ggml-webgpu.h
|
|
273
278
|
include/gguf.h)
|
|
274
279
|
|
|
275
280
|
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
|
@@ -1,94 +1,130 @@
|
|
|
1
|
-
|
|
2
|
-
@GGML_VARIABLES_EXPANDED@
|
|
3
|
-
|
|
4
1
|
@PACKAGE_INIT@
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
|
8
|
-
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
|
9
|
-
|
|
10
|
-
find_package(Threads REQUIRED)
|
|
11
|
-
|
|
12
|
-
find_library(GGML_LIBRARY ggml
|
|
13
|
-
REQUIRED
|
|
14
|
-
HINTS ${GGML_LIB_DIR}
|
|
15
|
-
NO_CMAKE_FIND_ROOT_PATH)
|
|
16
|
-
|
|
17
|
-
add_library(ggml::ggml UNKNOWN IMPORTED)
|
|
18
|
-
set_target_properties(ggml::ggml
|
|
19
|
-
PROPERTIES
|
|
20
|
-
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
|
21
|
-
|
|
22
|
-
find_library(GGML_BASE_LIBRARY ggml-base
|
|
23
|
-
REQUIRED
|
|
24
|
-
HINTS ${GGML_LIB_DIR}
|
|
25
|
-
NO_CMAKE_FIND_ROOT_PATH)
|
|
26
|
-
|
|
27
|
-
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
|
28
|
-
set_target_properties(ggml::ggml-base
|
|
29
|
-
PROPERTIES
|
|
30
|
-
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
|
3
|
+
@GGML_VARIABLES_EXPANDED@
|
|
31
4
|
|
|
5
|
+
# Find all dependencies before creating any target.
|
|
6
|
+
include(CMakeFindDependencyMacro)
|
|
7
|
+
find_dependency(Threads)
|
|
32
8
|
if (NOT GGML_SHARED_LIB)
|
|
9
|
+
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
|
|
10
|
+
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
|
|
11
|
+
|
|
33
12
|
if (APPLE AND GGML_ACCELERATE)
|
|
34
|
-
find_library(ACCELERATE_FRAMEWORK Accelerate
|
|
13
|
+
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
|
14
|
+
if(NOT ACCELERATE_FRAMEWORK)
|
|
15
|
+
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
|
16
|
+
return()
|
|
17
|
+
endif()
|
|
35
18
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
|
|
36
19
|
endif()
|
|
37
20
|
|
|
38
|
-
if (
|
|
39
|
-
|
|
21
|
+
if (GGML_OPENMP_ENABLED)
|
|
22
|
+
find_dependency(OpenMP)
|
|
40
23
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
|
41
24
|
endif()
|
|
42
25
|
|
|
43
26
|
if (GGML_CPU_HBM)
|
|
44
|
-
find_library(memkind memkind
|
|
27
|
+
find_library(memkind memkind)
|
|
28
|
+
if(NOT memkind)
|
|
29
|
+
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
|
30
|
+
return()
|
|
31
|
+
endif()
|
|
45
32
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
|
|
46
33
|
endif()
|
|
47
34
|
|
|
48
35
|
if (GGML_BLAS)
|
|
49
|
-
|
|
36
|
+
find_dependency(BLAS)
|
|
50
37
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
|
|
51
38
|
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
|
|
52
39
|
endif()
|
|
53
40
|
|
|
54
41
|
if (GGML_CUDA)
|
|
55
|
-
|
|
42
|
+
set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "")
|
|
43
|
+
find_dependency(CUDAToolkit)
|
|
44
|
+
if (GGML_STATIC)
|
|
45
|
+
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cudart_static>)
|
|
46
|
+
if (WIN32)
|
|
47
|
+
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas> $<LINK_ONLY:CUDA::cublasLt>)
|
|
48
|
+
else()
|
|
49
|
+
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas_static> $<LINK_ONLY:CUDA::cublasLt_static>)
|
|
50
|
+
endif()
|
|
51
|
+
endif()
|
|
52
|
+
if (NOT GGML_CUDA_NO_VMM)
|
|
53
|
+
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cuda_driver>)
|
|
54
|
+
endif()
|
|
56
55
|
endif()
|
|
57
56
|
|
|
58
57
|
if (GGML_METAL)
|
|
59
|
-
find_library(FOUNDATION_LIBRARY Foundation
|
|
60
|
-
find_library(METAL_FRAMEWORK Metal
|
|
61
|
-
find_library(METALKIT_FRAMEWORK MetalKit
|
|
58
|
+
find_library(FOUNDATION_LIBRARY Foundation)
|
|
59
|
+
find_library(METAL_FRAMEWORK Metal)
|
|
60
|
+
find_library(METALKIT_FRAMEWORK MetalKit)
|
|
61
|
+
if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK)
|
|
62
|
+
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
|
63
|
+
return()
|
|
64
|
+
endif()
|
|
65
|
+
set(GGML_METAL_INTERFACE_LINK_LIBRARIES
|
|
66
|
+
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
|
67
|
+
endif()
|
|
62
68
|
|
|
63
|
-
|
|
64
|
-
|
|
69
|
+
if (GGML_OPENCL)
|
|
70
|
+
find_dependency(OpenCL)
|
|
71
|
+
set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:OpenCL::OpenCL>)
|
|
65
72
|
endif()
|
|
66
73
|
|
|
67
74
|
if (GGML_VULKAN)
|
|
68
|
-
|
|
69
|
-
|
|
75
|
+
find_dependency(Vulkan)
|
|
76
|
+
set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:Vulkan::Vulkan>)
|
|
70
77
|
endif()
|
|
71
78
|
|
|
72
79
|
if (GGML_HIP)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
80
|
+
find_dependency(hip)
|
|
81
|
+
find_dependency(hipblas)
|
|
82
|
+
find_dependency(rocblas)
|
|
83
|
+
set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
|
77
84
|
endif()
|
|
78
85
|
|
|
79
86
|
if (GGML_SYCL)
|
|
87
|
+
set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
|
|
80
88
|
find_package(DNNL)
|
|
81
89
|
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
|
82
90
|
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
|
|
83
91
|
endif()
|
|
84
92
|
if (WIN32)
|
|
85
|
-
|
|
86
|
-
|
|
93
|
+
find_dependency(IntelSYCL)
|
|
94
|
+
find_dependency(MKL)
|
|
87
95
|
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
|
88
96
|
endif()
|
|
89
97
|
endif()
|
|
90
98
|
endif()
|
|
91
99
|
|
|
100
|
+
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
|
101
|
+
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
|
102
|
+
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
|
103
|
+
|
|
104
|
+
if(NOT TARGET ggml::ggml)
|
|
105
|
+
|
|
106
|
+
find_package(Threads REQUIRED)
|
|
107
|
+
|
|
108
|
+
find_library(GGML_LIBRARY ggml
|
|
109
|
+
REQUIRED
|
|
110
|
+
HINTS ${GGML_LIB_DIR}
|
|
111
|
+
NO_CMAKE_FIND_ROOT_PATH)
|
|
112
|
+
|
|
113
|
+
add_library(ggml::ggml UNKNOWN IMPORTED)
|
|
114
|
+
set_target_properties(ggml::ggml
|
|
115
|
+
PROPERTIES
|
|
116
|
+
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
|
117
|
+
|
|
118
|
+
find_library(GGML_BASE_LIBRARY ggml-base
|
|
119
|
+
REQUIRED
|
|
120
|
+
HINTS ${GGML_LIB_DIR}
|
|
121
|
+
NO_CMAKE_FIND_ROOT_PATH)
|
|
122
|
+
|
|
123
|
+
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
|
124
|
+
set_target_properties(ggml::ggml-base
|
|
125
|
+
PROPERTIES
|
|
126
|
+
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
|
127
|
+
|
|
92
128
|
set(_ggml_all_targets "")
|
|
93
129
|
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
|
94
130
|
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
|
@@ -149,4 +185,6 @@ set_target_properties(ggml::all
|
|
|
149
185
|
PROPERTIES
|
|
150
186
|
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
|
151
187
|
|
|
188
|
+
endif() # TARGET ggml::ggml
|
|
189
|
+
|
|
152
190
|
check_required_components(ggml)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
#include "ggml-backend.h"
|
|
5
|
+
|
|
6
|
+
#ifdef __cplusplus
|
|
7
|
+
extern "C" {
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
#define GGML_WEBGPU_NAME "WebGPU"
|
|
11
|
+
|
|
12
|
+
// Needed for examples in ggml
|
|
13
|
+
GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
|
|
14
|
+
|
|
15
|
+
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
|
|
16
|
+
|
|
17
|
+
#ifdef __cplusplus
|
|
18
|
+
}
|
|
19
|
+
#endif
|
|
@@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
|
|
|
22
22
|
return t->view_src != NULL;
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
-
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
|
26
|
-
if (a->type != b->type) {
|
|
27
|
-
return false;
|
|
28
|
-
}
|
|
29
|
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
30
|
-
if (a->ne[i] != b->ne[i]) {
|
|
31
|
-
return false;
|
|
32
|
-
}
|
|
33
|
-
if (a->nb[i] != b->nb[i]) {
|
|
34
|
-
return false;
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
return true;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
25
|
// ops that return true for this function must not use restrict pointers for their backend implementations
|
|
41
26
|
static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
42
27
|
switch (op) {
|
|
@@ -45,6 +45,10 @@
|
|
|
45
45
|
#include "ggml-vulkan.h"
|
|
46
46
|
#endif
|
|
47
47
|
|
|
48
|
+
#ifdef GGML_USE_WEBGPU
|
|
49
|
+
#include "ggml-webgpu.h"
|
|
50
|
+
#endif
|
|
51
|
+
|
|
48
52
|
#ifdef GGML_USE_OPENCL
|
|
49
53
|
#include "ggml-opencl.h"
|
|
50
54
|
#endif
|
|
@@ -173,6 +177,9 @@ struct ggml_backend_registry {
|
|
|
173
177
|
#ifdef GGML_USE_VULKAN
|
|
174
178
|
register_backend(ggml_backend_vk_reg());
|
|
175
179
|
#endif
|
|
180
|
+
#ifdef GGML_USE_WEBGPU
|
|
181
|
+
register_backend(ggml_backend_webgpu_reg());
|
|
182
|
+
#endif
|
|
176
183
|
#ifdef GGML_USE_OPENCL
|
|
177
184
|
register_backend(ggml_backend_opencl_reg());
|
|
178
185
|
#endif
|
|
@@ -352,21 +352,6 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
|
|
352
352
|
|
|
353
353
|
// backend copy
|
|
354
354
|
|
|
355
|
-
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
|
356
|
-
if (a->type != b->type) {
|
|
357
|
-
return false;
|
|
358
|
-
}
|
|
359
|
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
360
|
-
if (a->ne[i] != b->ne[i]) {
|
|
361
|
-
return false;
|
|
362
|
-
}
|
|
363
|
-
if (a->nb[i] != b->nb[i]) {
|
|
364
|
-
return false;
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
return true;
|
|
368
|
-
}
|
|
369
|
-
|
|
370
355
|
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
371
356
|
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
|
372
357
|
|
|
@@ -662,6 +647,7 @@ struct ggml_backend_sched {
|
|
|
662
647
|
// pipeline parallelism support
|
|
663
648
|
int n_copies;
|
|
664
649
|
int cur_copy;
|
|
650
|
+
int next_copy;
|
|
665
651
|
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
|
666
652
|
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
|
667
653
|
int n_graph_inputs;
|
|
@@ -1448,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|
|
1448
1434
|
}
|
|
1449
1435
|
}
|
|
1450
1436
|
|
|
1451
|
-
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
|
1452
|
-
|
|
1453
1437
|
return GGML_STATUS_SUCCESS;
|
|
1454
1438
|
}
|
|
1455
1439
|
|
|
@@ -1550,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|
|
1550
1534
|
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
|
1551
1535
|
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
|
1552
1536
|
|
|
1553
|
-
ggml_backend_sched_split_graph(sched, measure_graph);
|
|
1554
|
-
|
|
1555
1537
|
ggml_backend_sched_synchronize(sched);
|
|
1556
1538
|
|
|
1539
|
+
ggml_backend_sched_split_graph(sched, measure_graph);
|
|
1540
|
+
|
|
1557
1541
|
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
|
1558
1542
|
return false;
|
|
1559
1543
|
}
|
|
@@ -1565,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
|
|
1565
1549
|
|
|
1566
1550
|
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
1567
1551
|
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
|
1552
|
+
GGML_ASSERT(!sched->is_alloc);
|
|
1553
|
+
|
|
1554
|
+
sched->cur_copy = sched->next_copy;
|
|
1555
|
+
sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
|
|
1568
1556
|
|
|
1569
1557
|
ggml_backend_sched_split_graph(sched, graph);
|
|
1570
1558
|
|
|
@@ -1605,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
|
|
1605
1593
|
// if the graph is not already allocated, always use copy 0 after a synchronization
|
|
1606
1594
|
// this ensures that during generation the same copy is used every time,
|
|
1607
1595
|
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
|
|
1608
|
-
sched->cur_copy = 0;
|
|
1596
|
+
sched->next_copy = 0;
|
|
1609
1597
|
}
|
|
1610
1598
|
}
|
|
1611
1599
|
|
|
@@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
|
|
77
77
|
for (int i = 0; i < final_dims; i++) {
|
|
78
78
|
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
|
|
79
79
|
}
|
|
80
|
+
size_t elem_offset = offset / ggml_element_size(tensor);
|
|
81
|
+
acl_storage_len += elem_offset;
|
|
80
82
|
|
|
81
83
|
// Reverse ne and stride.
|
|
82
84
|
std::reverse(acl_ne, acl_ne + final_dims);
|
|
@@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
|
|
84
86
|
|
|
85
87
|
aclTensor* acl_tensor = aclCreateTensor(
|
|
86
88
|
acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
|
|
87
|
-
|
|
89
|
+
elem_offset, format, &acl_storage_len, 1,
|
|
88
90
|
tensor->data);
|
|
89
91
|
|
|
90
92
|
return acl_tensor;
|
|
@@ -99,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
|
|
|
99
99
|
}
|
|
100
100
|
}
|
|
101
101
|
|
|
102
|
-
void ggml_cann_unary_op(
|
|
102
|
+
void ggml_cann_op_unary(
|
|
103
103
|
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
|
104
104
|
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
105
105
|
ggml_tensor* src = dst->src[0];
|
|
@@ -111,6 +111,42 @@ void ggml_cann_unary_op(
|
|
|
111
111
|
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
|
112
112
|
}
|
|
113
113
|
|
|
114
|
+
void ggml_cann_op_unary_gated(
|
|
115
|
+
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
|
116
|
+
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
117
|
+
ggml_tensor* src0 = dst->src[0];
|
|
118
|
+
ggml_tensor* src1 = dst->src[1];
|
|
119
|
+
|
|
120
|
+
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
|
121
|
+
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
|
122
|
+
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
|
123
|
+
|
|
124
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
|
125
|
+
aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
|
|
126
|
+
if(src1) {
|
|
127
|
+
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
|
128
|
+
GGML_ASSERT(src0->type == src1->type);
|
|
129
|
+
|
|
130
|
+
acl_src0 = ggml_cann_create_tensor(src0);
|
|
131
|
+
acl_src1 = ggml_cann_create_tensor(src1);
|
|
132
|
+
} else {
|
|
133
|
+
int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
|
|
134
|
+
size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
|
|
135
|
+
acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
|
|
136
|
+
acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
|
|
137
|
+
if (swapped) {
|
|
138
|
+
std::swap(acl_src0, acl_src1);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
unary_op(ctx, acl_src0, acl_dst);
|
|
143
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
|
|
144
|
+
|
|
145
|
+
ggml_cann_release_resources(ctx, acl_src0, acl_dst);
|
|
146
|
+
if(src1)
|
|
147
|
+
ggml_cann_release_resources(ctx, acl_src1);
|
|
148
|
+
}
|
|
149
|
+
|
|
114
150
|
/**
|
|
115
151
|
* @brief Repeats elements of a tensor along each dimension according to the
|
|
116
152
|
* specified repeat array.
|
|
@@ -1785,8 +1821,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|
|
1785
1821
|
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
|
1786
1822
|
bcast_weight_nb[2], bcast_weight_nb[3],
|
|
1787
1823
|
bcast_weight_nb[4], bcast_weight_nb[5]};
|
|
1788
|
-
aclTensor* acl_weight_tensor
|
|
1789
|
-
|
|
1824
|
+
aclTensor* acl_weight_tensor;
|
|
1825
|
+
|
|
1826
|
+
bool weightToNZ = false;
|
|
1827
|
+
#ifdef ASCEND_310P
|
|
1828
|
+
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
|
1829
|
+
#endif
|
|
1830
|
+
if (weightToNZ && is_matmul_weight(weight)) {
|
|
1831
|
+
int64_t acl_stride[2] = {1, transpose_ne[1]};
|
|
1832
|
+
|
|
1833
|
+
// Reverse ne.
|
|
1834
|
+
std::reverse(transpose_ne, transpose_ne + n_dims);
|
|
1835
|
+
|
|
1836
|
+
std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
|
|
1837
|
+
|
|
1838
|
+
acl_weight_tensor = aclCreateTensor(
|
|
1839
|
+
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
|
|
1840
|
+
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
|
|
1841
|
+
} else {
|
|
1842
|
+
acl_weight_tensor =
|
|
1843
|
+
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
|
1844
|
+
}
|
|
1790
1845
|
aclTensor* acl_dst =
|
|
1791
1846
|
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
|
1792
1847
|
|