@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,641 +1,157 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
-
#include "ggml.h"
|
|
4
|
-
|
|
5
3
|
// GGML internal header
|
|
6
4
|
|
|
5
|
+
#include "ggml.h"
|
|
7
6
|
#include <assert.h>
|
|
7
|
+
#include <math.h>
|
|
8
8
|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
|
9
|
-
#include <stddef.h>
|
|
10
9
|
#include <stdbool.h>
|
|
11
|
-
#include <
|
|
12
|
-
#include <
|
|
13
|
-
|
|
14
|
-
#undef MIN
|
|
15
|
-
#undef MAX
|
|
16
|
-
|
|
17
|
-
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
|
18
|
-
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
19
|
-
|
|
20
|
-
#if defined(_MSC_VER)
|
|
21
|
-
|
|
22
|
-
#define m512bh(p) p
|
|
23
|
-
#define m512i(p) p
|
|
24
|
-
|
|
25
|
-
#else
|
|
26
|
-
|
|
27
|
-
#define m512bh(p) (__m512bh)(p)
|
|
28
|
-
#define m512i(p) (__m512i)(p)
|
|
29
|
-
|
|
30
|
-
#endif
|
|
31
|
-
|
|
32
|
-
/**
|
|
33
|
-
* Converts brain16 to float32.
|
|
34
|
-
*
|
|
35
|
-
* The bfloat16 floating point format has the following structure:
|
|
36
|
-
*
|
|
37
|
-
* ┌sign
|
|
38
|
-
* │
|
|
39
|
-
* │ ┌exponent
|
|
40
|
-
* │ │
|
|
41
|
-
* │ │ ┌mantissa
|
|
42
|
-
* │ │ │
|
|
43
|
-
* │┌──┴───┐┌─┴───┐
|
|
44
|
-
* 0b0000000000000000 brain16
|
|
45
|
-
*
|
|
46
|
-
* Since bf16 has the same number of exponent bits as a 32bit float,
|
|
47
|
-
* encoding and decoding numbers becomes relatively straightforward.
|
|
48
|
-
*
|
|
49
|
-
* ┌sign
|
|
50
|
-
* │
|
|
51
|
-
* │ ┌exponent
|
|
52
|
-
* │ │
|
|
53
|
-
* │ │ ┌mantissa
|
|
54
|
-
* │ │ │
|
|
55
|
-
* │┌──┴───┐┌─┴───────────────────┐
|
|
56
|
-
* 0b00000000000000000000000000000000 IEEE binary32
|
|
57
|
-
*
|
|
58
|
-
* For comparison, the standard fp16 format has fewer exponent bits.
|
|
59
|
-
*
|
|
60
|
-
* ┌sign
|
|
61
|
-
* │
|
|
62
|
-
* │ ┌exponent
|
|
63
|
-
* │ │
|
|
64
|
-
* │ │ ┌mantissa
|
|
65
|
-
* │ │ │
|
|
66
|
-
* │┌─┴─┐┌─┴──────┐
|
|
67
|
-
* 0b0000000000000000 IEEE binary16
|
|
68
|
-
*
|
|
69
|
-
* @see IEEE 754-2008
|
|
70
|
-
*/
|
|
71
|
-
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
|
72
|
-
union {
|
|
73
|
-
float f;
|
|
74
|
-
uint32_t i;
|
|
75
|
-
} u;
|
|
76
|
-
u.i = (uint32_t)h.bits << 16;
|
|
77
|
-
return u.f;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Converts float32 to brain16.
|
|
82
|
-
*
|
|
83
|
-
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
|
|
84
|
-
* Subnormals shall be flushed to zero, and NANs will be quiet.
|
|
85
|
-
* This code should vectorize nicely if using modern compilers.
|
|
86
|
-
*/
|
|
87
|
-
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|
88
|
-
ggml_bf16_t h;
|
|
89
|
-
union {
|
|
90
|
-
float f;
|
|
91
|
-
uint32_t i;
|
|
92
|
-
} u;
|
|
93
|
-
u.f = s;
|
|
94
|
-
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
|
95
|
-
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
|
96
|
-
return h;
|
|
97
|
-
}
|
|
98
|
-
if (!(u.i & 0x7f800000)) { /* subnormal */
|
|
99
|
-
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
|
|
100
|
-
return h;
|
|
101
|
-
}
|
|
102
|
-
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
|
103
|
-
return h;
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
|
107
|
-
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
|
108
|
-
|
|
109
|
-
#ifdef __cplusplus
|
|
110
|
-
extern "C" {
|
|
111
|
-
#endif
|
|
112
|
-
|
|
113
|
-
// static_assert should be a #define, but if it's not,
|
|
114
|
-
// fall back to the _Static_assert C11 keyword.
|
|
115
|
-
// if C99 - static_assert is noop
|
|
116
|
-
// ref: https://stackoverflow.com/a/53923785/4039976
|
|
117
|
-
#ifndef __cplusplus
|
|
118
|
-
#ifndef static_assert
|
|
119
|
-
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
|
120
|
-
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
|
121
|
-
#else
|
|
122
|
-
#define static_assert(cond, msg) struct global_scope_noop_trick
|
|
123
|
-
#endif
|
|
124
|
-
#endif
|
|
125
|
-
#endif
|
|
10
|
+
#include <stdint.h>
|
|
11
|
+
#include <string.h>
|
|
126
12
|
|
|
127
|
-
|
|
128
|
-
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
|
129
|
-
#ifndef __FMA__
|
|
130
|
-
#define __FMA__
|
|
131
|
-
#endif
|
|
132
|
-
#ifndef __F16C__
|
|
133
|
-
#define __F16C__
|
|
134
|
-
#endif
|
|
135
|
-
#endif
|
|
136
|
-
|
|
137
|
-
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
|
138
|
-
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
|
139
|
-
#ifndef __SSE3__
|
|
140
|
-
#define __SSE3__
|
|
141
|
-
#endif
|
|
142
|
-
#ifndef __SSSE3__
|
|
143
|
-
#define __SSSE3__
|
|
144
|
-
#endif
|
|
145
|
-
#endif
|
|
146
|
-
|
|
147
|
-
#if defined(__ARM_FEATURE_SVE)
|
|
13
|
+
#ifdef __ARM_FEATURE_SVE
|
|
148
14
|
#include <arm_sve.h>
|
|
149
|
-
#endif
|
|
15
|
+
#endif // __ARM_FEATURE_SVE
|
|
150
16
|
|
|
151
|
-
// 16-bit float
|
|
152
|
-
// on Arm, we use __fp16
|
|
153
|
-
// on x86, we use uint16_t
|
|
154
17
|
#if defined(__ARM_NEON)
|
|
155
|
-
|
|
156
18
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
157
19
|
//
|
|
158
20
|
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
159
21
|
//
|
|
160
22
|
#include <arm_neon.h>
|
|
23
|
+
#endif
|
|
161
24
|
|
|
162
|
-
#
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
|
167
|
-
|
|
168
|
-
#else
|
|
169
|
-
|
|
170
|
-
typedef __fp16 ggml_fp16_internal_t;
|
|
171
|
-
|
|
172
|
-
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
|
173
|
-
|
|
174
|
-
#endif // _MSC_VER
|
|
175
|
-
|
|
176
|
-
#if !defined(__aarch64__)
|
|
177
|
-
|
|
178
|
-
// 32-bit ARM compatibility
|
|
179
|
-
|
|
180
|
-
// vaddvq_s16
|
|
181
|
-
// vpaddq_s16
|
|
182
|
-
// vpaddq_s32
|
|
183
|
-
// vaddvq_s32
|
|
184
|
-
// vaddvq_f32
|
|
185
|
-
// vmaxvq_f32
|
|
186
|
-
// vcvtnq_s32_f32
|
|
187
|
-
// vzip1_u8
|
|
188
|
-
// vzip2_u8
|
|
189
|
-
|
|
190
|
-
inline static int32_t vaddvq_s16(int16x8_t v) {
|
|
191
|
-
return
|
|
192
|
-
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
|
193
|
-
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
|
194
|
-
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
|
195
|
-
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
|
199
|
-
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
|
200
|
-
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
|
201
|
-
return vcombine_s16(a0, b0);
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
|
205
|
-
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
|
206
|
-
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
|
207
|
-
return vcombine_s32(a0, b0);
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
211
|
-
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
inline static float vaddvq_f32(float32x4_t v) {
|
|
215
|
-
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
inline static float vmaxvq_f32(float32x4_t v) {
|
|
219
|
-
return
|
|
220
|
-
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
|
221
|
-
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
225
|
-
int32x4_t res;
|
|
226
|
-
|
|
227
|
-
res[0] = roundf(vgetq_lane_f32(v, 0));
|
|
228
|
-
res[1] = roundf(vgetq_lane_f32(v, 1));
|
|
229
|
-
res[2] = roundf(vgetq_lane_f32(v, 2));
|
|
230
|
-
res[3] = roundf(vgetq_lane_f32(v, 3));
|
|
231
|
-
|
|
232
|
-
return res;
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
|
236
|
-
uint8x8_t res;
|
|
237
|
-
|
|
238
|
-
res[0] = a[0]; res[1] = b[0];
|
|
239
|
-
res[2] = a[1]; res[3] = b[1];
|
|
240
|
-
res[4] = a[2]; res[5] = b[2];
|
|
241
|
-
res[6] = a[3]; res[7] = b[3];
|
|
242
|
-
|
|
243
|
-
return res;
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
|
247
|
-
uint8x8_t res;
|
|
248
|
-
|
|
249
|
-
res[0] = a[4]; res[1] = b[4];
|
|
250
|
-
res[2] = a[5]; res[3] = b[5];
|
|
251
|
-
res[4] = a[6]; res[5] = b[6];
|
|
252
|
-
res[6] = a[7]; res[7] = b[7];
|
|
253
|
-
|
|
254
|
-
return res;
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
// vld1q_s16_x2
|
|
258
|
-
// vld1q_u8_x2
|
|
259
|
-
// vld1q_u8_x4
|
|
260
|
-
// vld1q_s8_x2
|
|
261
|
-
// vld1q_s8_x4
|
|
262
|
-
// TODO: double-check these work correctly
|
|
263
|
-
|
|
264
|
-
typedef struct ggml_int16x8x2_t {
|
|
265
|
-
int16x8_t val[2];
|
|
266
|
-
} ggml_int16x8x2_t;
|
|
267
|
-
|
|
268
|
-
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
|
269
|
-
ggml_int16x8x2_t res;
|
|
270
|
-
|
|
271
|
-
res.val[0] = vld1q_s16(ptr + 0);
|
|
272
|
-
res.val[1] = vld1q_s16(ptr + 8);
|
|
273
|
-
|
|
274
|
-
return res;
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
typedef struct ggml_uint8x16x2_t {
|
|
278
|
-
uint8x16_t val[2];
|
|
279
|
-
} ggml_uint8x16x2_t;
|
|
280
|
-
|
|
281
|
-
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
|
282
|
-
ggml_uint8x16x2_t res;
|
|
283
|
-
|
|
284
|
-
res.val[0] = vld1q_u8(ptr + 0);
|
|
285
|
-
res.val[1] = vld1q_u8(ptr + 16);
|
|
286
|
-
|
|
287
|
-
return res;
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
typedef struct ggml_uint8x16x4_t {
|
|
291
|
-
uint8x16_t val[4];
|
|
292
|
-
} ggml_uint8x16x4_t;
|
|
293
|
-
|
|
294
|
-
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
|
295
|
-
ggml_uint8x16x4_t res;
|
|
296
|
-
|
|
297
|
-
res.val[0] = vld1q_u8(ptr + 0);
|
|
298
|
-
res.val[1] = vld1q_u8(ptr + 16);
|
|
299
|
-
res.val[2] = vld1q_u8(ptr + 32);
|
|
300
|
-
res.val[3] = vld1q_u8(ptr + 48);
|
|
301
|
-
|
|
302
|
-
return res;
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
typedef struct ggml_int8x16x2_t {
|
|
306
|
-
int8x16_t val[2];
|
|
307
|
-
} ggml_int8x16x2_t;
|
|
308
|
-
|
|
309
|
-
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
|
310
|
-
ggml_int8x16x2_t res;
|
|
311
|
-
|
|
312
|
-
res.val[0] = vld1q_s8(ptr + 0);
|
|
313
|
-
res.val[1] = vld1q_s8(ptr + 16);
|
|
314
|
-
|
|
315
|
-
return res;
|
|
316
|
-
}
|
|
25
|
+
#if defined(__F16C__)
|
|
26
|
+
#include <immintrin.h>
|
|
27
|
+
#endif
|
|
317
28
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
29
|
+
#ifdef __cplusplus
|
|
30
|
+
extern "C" {
|
|
31
|
+
#endif
|
|
321
32
|
|
|
322
|
-
|
|
323
|
-
|
|
33
|
+
#undef MIN
|
|
34
|
+
#undef MAX
|
|
324
35
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
res.val[2] = vld1q_s8(ptr + 32);
|
|
328
|
-
res.val[3] = vld1q_s8(ptr + 48);
|
|
36
|
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
|
37
|
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
329
38
|
|
|
330
|
-
|
|
331
|
-
|
|
39
|
+
// required for mmap as gguf only guarantees 32-byte alignment
|
|
40
|
+
#define TENSOR_ALIGNMENT 32
|
|
332
41
|
|
|
333
|
-
//
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
res[ 9] = a[b[ 9]];
|
|
347
|
-
res[10] = a[b[10]];
|
|
348
|
-
res[11] = a[b[11]];
|
|
349
|
-
res[12] = a[b[12]];
|
|
350
|
-
res[13] = a[b[13]];
|
|
351
|
-
res[14] = a[b[14]];
|
|
352
|
-
res[15] = a[b[15]];
|
|
353
|
-
|
|
354
|
-
return res;
|
|
355
|
-
}
|
|
42
|
+
// static_assert should be a #define, but if it's not,
|
|
43
|
+
// fall back to the _Static_assert C11 keyword.
|
|
44
|
+
// if C99 - static_assert is noop
|
|
45
|
+
// ref: https://stackoverflow.com/a/53923785/4039976
|
|
46
|
+
#ifndef __cplusplus
|
|
47
|
+
#ifndef static_assert
|
|
48
|
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
|
49
|
+
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
|
50
|
+
#else
|
|
51
|
+
#define static_assert(cond, msg) struct global_scope_noop_trick
|
|
52
|
+
#endif
|
|
53
|
+
#endif
|
|
54
|
+
#endif
|
|
356
55
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
uint8x16_t res;
|
|
360
|
-
|
|
361
|
-
res[ 0] = a[b[ 0]];
|
|
362
|
-
res[ 1] = a[b[ 1]];
|
|
363
|
-
res[ 2] = a[b[ 2]];
|
|
364
|
-
res[ 3] = a[b[ 3]];
|
|
365
|
-
res[ 4] = a[b[ 4]];
|
|
366
|
-
res[ 5] = a[b[ 5]];
|
|
367
|
-
res[ 6] = a[b[ 6]];
|
|
368
|
-
res[ 7] = a[b[ 7]];
|
|
369
|
-
res[ 8] = a[b[ 8]];
|
|
370
|
-
res[ 9] = a[b[ 9]];
|
|
371
|
-
res[10] = a[b[10]];
|
|
372
|
-
res[11] = a[b[11]];
|
|
373
|
-
res[12] = a[b[12]];
|
|
374
|
-
res[13] = a[b[13]];
|
|
375
|
-
res[14] = a[b[14]];
|
|
376
|
-
res[15] = a[b[15]];
|
|
377
|
-
|
|
378
|
-
return res;
|
|
56
|
+
static inline int ggml_up32(int n) {
|
|
57
|
+
return (n + 31) & ~31;
|
|
379
58
|
}
|
|
380
59
|
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
#define ggml_uint8x16x2_t uint8x16x2_t
|
|
385
|
-
#define ggml_uint8x16x4_t uint8x16x4_t
|
|
386
|
-
#define ggml_int8x16x2_t int8x16x2_t
|
|
387
|
-
#define ggml_int8x16x4_t int8x16x4_t
|
|
388
|
-
|
|
389
|
-
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
|
390
|
-
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
|
391
|
-
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
|
392
|
-
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
|
393
|
-
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
|
394
|
-
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
|
395
|
-
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
|
396
|
-
|
|
397
|
-
#endif // !defined(__aarch64__)
|
|
60
|
+
//static inline int ggml_up64(int n) {
|
|
61
|
+
// return (n + 63) & ~63;
|
|
62
|
+
//}
|
|
398
63
|
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
|
404
|
-
|
|
405
|
-
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
|
64
|
+
static inline int ggml_up(int n, int m) {
|
|
65
|
+
// assert m is a power of 2
|
|
66
|
+
GGML_ASSERT((m & (m - 1)) == 0);
|
|
67
|
+
return (n + m - 1) & ~(m - 1);
|
|
406
68
|
}
|
|
407
69
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
|
413
|
-
|
|
414
|
-
#endif // defined(__ARM_NEON)
|
|
415
|
-
|
|
416
|
-
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
417
|
-
|
|
418
|
-
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
419
|
-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
420
|
-
|
|
421
|
-
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
70
|
+
//
|
|
71
|
+
// logging
|
|
72
|
+
//
|
|
422
73
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
return (float)tmp;
|
|
427
|
-
}
|
|
74
|
+
GGML_ATTRIBUTE_FORMAT(2, 3)
|
|
75
|
+
void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
|
|
76
|
+
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
|
|
428
77
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
78
|
+
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
|
79
|
+
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
80
|
+
#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
|
81
|
+
#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
82
|
+
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
|
83
|
+
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
|
435
84
|
|
|
436
|
-
#
|
|
85
|
+
#define GGML_DEBUG 0
|
|
437
86
|
|
|
438
|
-
#
|
|
439
|
-
#
|
|
440
|
-
#else
|
|
441
|
-
#ifdef __POWER9_VECTOR__
|
|
442
|
-
#include <altivec.h>
|
|
443
|
-
#undef bool
|
|
444
|
-
#define bool _Bool
|
|
87
|
+
#if (GGML_DEBUG >= 1)
|
|
88
|
+
#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
445
89
|
#else
|
|
446
|
-
#
|
|
447
|
-
#include <intrin.h>
|
|
448
|
-
#else
|
|
449
|
-
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
|
450
|
-
#if !defined(__riscv)
|
|
451
|
-
#include <immintrin.h>
|
|
452
|
-
#endif
|
|
453
|
-
#endif
|
|
454
|
-
#endif
|
|
455
|
-
#endif
|
|
456
|
-
#endif
|
|
457
|
-
|
|
458
|
-
#ifdef __riscv_v_intrinsic
|
|
459
|
-
#include <riscv_vector.h>
|
|
460
|
-
#endif
|
|
461
|
-
|
|
462
|
-
#if defined(__loongarch64)
|
|
463
|
-
#if defined(__loongarch_asx)
|
|
464
|
-
#include <lasxintrin.h>
|
|
465
|
-
#endif
|
|
466
|
-
#if defined(__loongarch_sx)
|
|
467
|
-
#include <lsxintrin.h>
|
|
90
|
+
#define GGML_PRINT_DEBUG(...)
|
|
468
91
|
#endif
|
|
469
|
-
#endif
|
|
470
|
-
|
|
471
|
-
#if defined(__loongarch_asx)
|
|
472
|
-
|
|
473
|
-
typedef union {
|
|
474
|
-
int32_t i;
|
|
475
|
-
float f;
|
|
476
|
-
} ft_union;
|
|
477
|
-
|
|
478
|
-
/* float type data load instructions */
|
|
479
|
-
static __m128 __lsx_vreplfr2vr_s(float val) {
|
|
480
|
-
ft_union fi_tmpval = {.f = val};
|
|
481
|
-
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
|
482
|
-
}
|
|
483
92
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
93
|
+
#if (GGML_DEBUG >= 5)
|
|
94
|
+
#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
95
|
+
#else
|
|
96
|
+
#define GGML_PRINT_DEBUG_5(...)
|
|
488
97
|
#endif
|
|
489
98
|
|
|
490
|
-
#
|
|
491
|
-
|
|
492
|
-
#ifdef _MSC_VER
|
|
493
|
-
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
494
|
-
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
99
|
+
#if (GGML_DEBUG >= 10)
|
|
100
|
+
#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
495
101
|
#else
|
|
496
|
-
#define
|
|
497
|
-
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
102
|
+
#define GGML_PRINT_DEBUG_10(...)
|
|
498
103
|
#endif
|
|
499
104
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
503
|
-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
504
|
-
/* the inline asm below is about 12% faster than the lookup method */
|
|
505
|
-
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
506
|
-
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
105
|
+
// tensor params
|
|
507
106
|
|
|
508
|
-
static
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
"mtfprd %0,%2\n"
|
|
513
|
-
"xscvhpdp %0,%0\n"
|
|
514
|
-
"frsp %1,%0\n" :
|
|
515
|
-
/* temp */ "=d"(d),
|
|
516
|
-
/* out */ "=f"(f):
|
|
517
|
-
/* in */ "r"(h));
|
|
518
|
-
return f;
|
|
107
|
+
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
|
108
|
+
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
|
109
|
+
assert(params_size <= GGML_MAX_OP_PARAMS);
|
|
110
|
+
memcpy(tensor->op_params, params, params_size);
|
|
519
111
|
}
|
|
520
112
|
|
|
521
|
-
static
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
__asm__( /* xscvdphp can work on double or single precision */
|
|
525
|
-
"xscvdphp %0,%2\n"
|
|
526
|
-
"mffprd %1,%0\n" :
|
|
527
|
-
/* temp */ "=d"(d),
|
|
528
|
-
/* out */ "=r"(r):
|
|
529
|
-
/* in */ "f"(f));
|
|
530
|
-
return r;
|
|
113
|
+
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
|
114
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
|
115
|
+
return ((const int32_t *)(tensor->op_params))[i];
|
|
531
116
|
}
|
|
532
117
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
// ref: https://github.com/Maratyszcza/FP16
|
|
537
|
-
|
|
538
|
-
static inline float fp32_from_bits(uint32_t w) {
|
|
539
|
-
union {
|
|
540
|
-
uint32_t as_bits;
|
|
541
|
-
float as_value;
|
|
542
|
-
} fp32;
|
|
543
|
-
fp32.as_bits = w;
|
|
544
|
-
return fp32.as_value;
|
|
118
|
+
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
|
119
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
|
120
|
+
return ((const float *)(tensor->op_params))[i];
|
|
545
121
|
}
|
|
546
122
|
|
|
547
|
-
static
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
uint32_t as_bits;
|
|
551
|
-
} fp32;
|
|
552
|
-
fp32.as_value = f;
|
|
553
|
-
return fp32.as_bits;
|
|
123
|
+
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
|
124
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
|
125
|
+
((int32_t *)(tensor->op_params))[i] = value;
|
|
554
126
|
}
|
|
555
127
|
|
|
556
|
-
static
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
const uint32_t two_w = w + w;
|
|
560
|
-
|
|
561
|
-
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
|
562
|
-
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
563
|
-
const float exp_scale = 0x1.0p-112f;
|
|
564
|
-
#else
|
|
565
|
-
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
|
566
|
-
#endif
|
|
567
|
-
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
|
568
|
-
|
|
569
|
-
const uint32_t magic_mask = UINT32_C(126) << 23;
|
|
570
|
-
const float magic_bias = 0.5f;
|
|
571
|
-
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
|
572
|
-
|
|
573
|
-
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
|
574
|
-
const uint32_t result = sign |
|
|
575
|
-
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
|
576
|
-
return fp32_from_bits(result);
|
|
128
|
+
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
|
129
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
|
130
|
+
((float *)(tensor->op_params))[i] = value;
|
|
577
131
|
}
|
|
578
132
|
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
|
585
|
-
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
|
586
|
-
#endif
|
|
587
|
-
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
|
588
|
-
|
|
589
|
-
const uint32_t w = fp32_to_bits(f);
|
|
590
|
-
const uint32_t shl1_w = w + w;
|
|
591
|
-
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
592
|
-
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
|
593
|
-
if (bias < UINT32_C(0x71000000)) {
|
|
594
|
-
bias = UINT32_C(0x71000000);
|
|
595
|
-
}
|
|
596
|
-
|
|
597
|
-
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
|
598
|
-
const uint32_t bits = fp32_to_bits(base);
|
|
599
|
-
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
|
600
|
-
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
|
601
|
-
const uint32_t nonsign = exp_bits + mantissa_bits;
|
|
602
|
-
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
|
603
|
-
}
|
|
604
|
-
|
|
605
|
-
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
606
|
-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
607
|
-
|
|
608
|
-
#endif // __F16C__
|
|
609
|
-
|
|
610
|
-
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
|
611
|
-
|
|
612
|
-
#ifdef __ARM_FEATURE_SVE
|
|
613
|
-
#include <arm_sve.h>
|
|
614
|
-
#endif // __ARM_FEATURE_SVE
|
|
615
|
-
|
|
616
|
-
// precomputed f32 table for f16 (256 KB)
|
|
617
|
-
// defined in ggml.c, initialized in ggml_init()
|
|
618
|
-
extern float ggml_table_f32_f16[1 << 16];
|
|
619
|
-
|
|
620
|
-
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
|
621
|
-
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
|
622
|
-
// This is also true for POWER9.
|
|
623
|
-
#if !defined(GGML_FP16_TO_FP32)
|
|
624
|
-
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
625
|
-
uint16_t s;
|
|
626
|
-
memcpy(&s, &f, sizeof(uint16_t));
|
|
627
|
-
return ggml_table_f32_f16[s];
|
|
628
|
-
}
|
|
133
|
+
struct ggml_map_custom1_op_params {
|
|
134
|
+
ggml_custom1_op_t fun;
|
|
135
|
+
int n_tasks;
|
|
136
|
+
void * userdata;
|
|
137
|
+
};
|
|
629
138
|
|
|
630
|
-
|
|
631
|
-
|
|
139
|
+
struct ggml_map_custom2_op_params {
|
|
140
|
+
ggml_custom2_op_t fun;
|
|
141
|
+
int n_tasks;
|
|
142
|
+
void * userdata;
|
|
143
|
+
};
|
|
632
144
|
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
145
|
+
struct ggml_map_custom3_op_params {
|
|
146
|
+
ggml_custom3_op_t fun;
|
|
147
|
+
int n_tasks;
|
|
148
|
+
void * userdata;
|
|
149
|
+
};
|
|
636
150
|
|
|
637
151
|
// bitset
|
|
638
152
|
|
|
153
|
+
typedef uint32_t ggml_bitset_t;
|
|
154
|
+
|
|
639
155
|
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
|
640
156
|
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
|
641
157
|
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
|
@@ -661,6 +177,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
|
|
661
177
|
#define GGML_HASHSET_FULL ((size_t)-1)
|
|
662
178
|
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
|
663
179
|
|
|
180
|
+
struct ggml_hash_set {
|
|
181
|
+
size_t size;
|
|
182
|
+
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
|
183
|
+
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
|
184
|
+
};
|
|
185
|
+
|
|
664
186
|
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
|
665
187
|
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
|
666
188
|
|
|
@@ -674,7 +196,7 @@ void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
|
|
|
674
196
|
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
675
197
|
|
|
676
198
|
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
|
677
|
-
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
199
|
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key);
|
|
678
200
|
|
|
679
201
|
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
|
680
202
|
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
|
@@ -688,7 +210,7 @@ static inline size_t ggml_hash(const struct ggml_tensor * p) {
|
|
|
688
210
|
return (size_t)(uintptr_t)p >> 4;
|
|
689
211
|
}
|
|
690
212
|
|
|
691
|
-
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
|
213
|
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key) {
|
|
692
214
|
size_t h = ggml_hash(key) % hash_set->size;
|
|
693
215
|
|
|
694
216
|
// linear probing
|
|
@@ -750,6 +272,280 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
|
|
|
750
272
|
GGML_ABORT("fatal error");
|
|
751
273
|
}
|
|
752
274
|
|
|
275
|
+
// computation graph
|
|
276
|
+
|
|
277
|
+
enum ggml_cgraph_eval_order {
|
|
278
|
+
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
|
279
|
+
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
|
280
|
+
GGML_CGRAPH_EVAL_ORDER_COUNT
|
|
281
|
+
};
|
|
282
|
+
|
|
283
|
+
struct ggml_cgraph {
|
|
284
|
+
int size; // maximum number of nodes/leafs/grads/grad_accs
|
|
285
|
+
int n_nodes; // number of nodes currently in use
|
|
286
|
+
int n_leafs; // number of leafs currently in use
|
|
287
|
+
|
|
288
|
+
struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
|
|
289
|
+
struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
|
|
290
|
+
struct ggml_tensor ** grad_accs; // accumulators for node gradients
|
|
291
|
+
struct ggml_tensor ** leafs; // tensors with constant data
|
|
292
|
+
|
|
293
|
+
struct ggml_hash_set visited_hash_set;
|
|
294
|
+
|
|
295
|
+
enum ggml_cgraph_eval_order order;
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
|
299
|
+
|
|
300
|
+
// Memory allocation
|
|
301
|
+
|
|
302
|
+
void * ggml_aligned_malloc(size_t size);
|
|
303
|
+
void ggml_aligned_free(void * ptr, size_t size);
|
|
304
|
+
|
|
305
|
+
// FP16 to FP32 conversion
|
|
306
|
+
|
|
307
|
+
#if defined(__ARM_NEON)
|
|
308
|
+
#ifdef _MSC_VER
|
|
309
|
+
typedef uint16_t ggml_fp16_internal_t;
|
|
310
|
+
#else
|
|
311
|
+
typedef __fp16 ggml_fp16_internal_t;
|
|
312
|
+
#endif
|
|
313
|
+
#endif
|
|
314
|
+
|
|
315
|
+
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
316
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
317
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
318
|
+
|
|
319
|
+
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
320
|
+
|
|
321
|
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
322
|
+
ggml_fp16_internal_t tmp;
|
|
323
|
+
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
324
|
+
return (float)tmp;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
328
|
+
ggml_fp16_t res;
|
|
329
|
+
ggml_fp16_internal_t tmp = f;
|
|
330
|
+
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
331
|
+
return res;
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
#elif defined(__F16C__)
|
|
335
|
+
|
|
336
|
+
#ifdef _MSC_VER
|
|
337
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
338
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
339
|
+
#else
|
|
340
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
341
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
342
|
+
#endif
|
|
343
|
+
|
|
344
|
+
#elif defined(__POWER9_VECTOR__)
|
|
345
|
+
|
|
346
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
347
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
348
|
+
/* the inline asm below is about 12% faster than the lookup method */
|
|
349
|
+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
350
|
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
351
|
+
|
|
352
|
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
353
|
+
register float f;
|
|
354
|
+
register double d;
|
|
355
|
+
__asm__(
|
|
356
|
+
"mtfprd %0,%2\n"
|
|
357
|
+
"xscvhpdp %0,%0\n"
|
|
358
|
+
"frsp %1,%0\n" :
|
|
359
|
+
/* temp */ "=d"(d),
|
|
360
|
+
/* out */ "=f"(f):
|
|
361
|
+
/* in */ "r"(h));
|
|
362
|
+
return f;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
366
|
+
register double d;
|
|
367
|
+
register ggml_fp16_t r;
|
|
368
|
+
__asm__( /* xscvdphp can work on double or single precision */
|
|
369
|
+
"xscvdphp %0,%2\n"
|
|
370
|
+
"mffprd %1,%0\n" :
|
|
371
|
+
/* temp */ "=d"(d),
|
|
372
|
+
/* out */ "=r"(r):
|
|
373
|
+
/* in */ "f"(f));
|
|
374
|
+
return r;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
#else
|
|
378
|
+
|
|
379
|
+
// FP16 <-> FP32
|
|
380
|
+
// ref: https://github.com/Maratyszcza/FP16
|
|
381
|
+
|
|
382
|
+
static inline float fp32_from_bits(uint32_t w) {
|
|
383
|
+
union {
|
|
384
|
+
uint32_t as_bits;
|
|
385
|
+
float as_value;
|
|
386
|
+
} fp32;
|
|
387
|
+
fp32.as_bits = w;
|
|
388
|
+
return fp32.as_value;
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
static inline uint32_t fp32_to_bits(float f) {
|
|
392
|
+
union {
|
|
393
|
+
float as_value;
|
|
394
|
+
uint32_t as_bits;
|
|
395
|
+
} fp32;
|
|
396
|
+
fp32.as_value = f;
|
|
397
|
+
return fp32.as_bits;
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
401
|
+
const uint32_t w = (uint32_t) h << 16;
|
|
402
|
+
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
403
|
+
const uint32_t two_w = w + w;
|
|
404
|
+
|
|
405
|
+
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
|
406
|
+
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
|
|
407
|
+
const float exp_scale = 0x1.0p-112f;
|
|
408
|
+
#else
|
|
409
|
+
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
|
410
|
+
#endif
|
|
411
|
+
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
|
412
|
+
|
|
413
|
+
const uint32_t magic_mask = UINT32_C(126) << 23;
|
|
414
|
+
const float magic_bias = 0.5f;
|
|
415
|
+
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
|
416
|
+
|
|
417
|
+
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
|
418
|
+
const uint32_t result = sign |
|
|
419
|
+
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
|
420
|
+
return fp32_from_bits(result);
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
424
|
+
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
|
|
425
|
+
const float scale_to_inf = 0x1.0p+112f;
|
|
426
|
+
const float scale_to_zero = 0x1.0p-110f;
|
|
427
|
+
#else
|
|
428
|
+
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
|
429
|
+
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
|
430
|
+
#endif
|
|
431
|
+
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
|
432
|
+
|
|
433
|
+
const uint32_t w = fp32_to_bits(f);
|
|
434
|
+
const uint32_t shl1_w = w + w;
|
|
435
|
+
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
436
|
+
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
|
437
|
+
if (bias < UINT32_C(0x71000000)) {
|
|
438
|
+
bias = UINT32_C(0x71000000);
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
|
442
|
+
const uint32_t bits = fp32_to_bits(base);
|
|
443
|
+
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
|
444
|
+
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
|
445
|
+
const uint32_t nonsign = exp_bits + mantissa_bits;
|
|
446
|
+
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
450
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
451
|
+
|
|
452
|
+
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
|
453
|
+
|
|
454
|
+
// precomputed f32 table for f16 (256 KB)
|
|
455
|
+
// defined in ggml.c, initialized in ggml_init()
|
|
456
|
+
GGML_API float ggml_table_f32_f16[1 << 16];
|
|
457
|
+
|
|
458
|
+
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
|
459
|
+
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
|
460
|
+
// This is also true for POWER9.
|
|
461
|
+
#if !defined(GGML_FP16_TO_FP32)
|
|
462
|
+
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
463
|
+
uint16_t s;
|
|
464
|
+
memcpy(&s, &f, sizeof(uint16_t));
|
|
465
|
+
return ggml_table_f32_f16[s];
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
|
469
|
+
#endif
|
|
470
|
+
|
|
471
|
+
#if !defined(GGML_FP32_TO_FP16)
|
|
472
|
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
473
|
+
#endif
|
|
474
|
+
|
|
475
|
+
/**
|
|
476
|
+
* Converts brain16 to float32.
|
|
477
|
+
*
|
|
478
|
+
* The bfloat16 floating point format has the following structure:
|
|
479
|
+
*
|
|
480
|
+
* ┌sign
|
|
481
|
+
* │
|
|
482
|
+
* │ ┌exponent
|
|
483
|
+
* │ │
|
|
484
|
+
* │ │ ┌mantissa
|
|
485
|
+
* │ │ │
|
|
486
|
+
* │┌──┴───┐┌─┴───┐
|
|
487
|
+
* 0b0000000000000000 brain16
|
|
488
|
+
*
|
|
489
|
+
* Since bf16 has the same number of exponent bits as a 32bit float,
|
|
490
|
+
* encoding and decoding numbers becomes relatively straightforward.
|
|
491
|
+
*
|
|
492
|
+
* ┌sign
|
|
493
|
+
* │
|
|
494
|
+
* │ ┌exponent
|
|
495
|
+
* │ │
|
|
496
|
+
* │ │ ┌mantissa
|
|
497
|
+
* │ │ │
|
|
498
|
+
* │┌──┴───┐┌─┴───────────────────┐
|
|
499
|
+
* 0b00000000000000000000000000000000 IEEE binary32
|
|
500
|
+
*
|
|
501
|
+
* For comparison, the standard fp16 format has fewer exponent bits.
|
|
502
|
+
*
|
|
503
|
+
* ┌sign
|
|
504
|
+
* │
|
|
505
|
+
* │ ┌exponent
|
|
506
|
+
* │ │
|
|
507
|
+
* │ │ ┌mantissa
|
|
508
|
+
* │ │ │
|
|
509
|
+
* │┌─┴─┐┌─┴──────┐
|
|
510
|
+
* 0b0000000000000000 IEEE binary16
|
|
511
|
+
*
|
|
512
|
+
* @see IEEE 754-2008
|
|
513
|
+
*/
|
|
514
|
+
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
|
515
|
+
union {
|
|
516
|
+
float f;
|
|
517
|
+
uint32_t i;
|
|
518
|
+
} u;
|
|
519
|
+
u.i = (uint32_t)h.bits << 16;
|
|
520
|
+
return u.f;
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
/**
|
|
524
|
+
* Converts float32 to brain16.
|
|
525
|
+
*
|
|
526
|
+
* This is binary identical with Google Brain float conversion.
|
|
527
|
+
* Floats shall round to nearest even, and NANs shall be quiet.
|
|
528
|
+
* Subnormals aren't flushed to zero, except perhaps when used.
|
|
529
|
+
* This code should vectorize nicely if using modern compilers.
|
|
530
|
+
*/
|
|
531
|
+
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|
532
|
+
ggml_bf16_t h;
|
|
533
|
+
union {
|
|
534
|
+
float f;
|
|
535
|
+
uint32_t i;
|
|
536
|
+
} u;
|
|
537
|
+
u.f = s;
|
|
538
|
+
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
|
539
|
+
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
|
540
|
+
return h;
|
|
541
|
+
}
|
|
542
|
+
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
|
543
|
+
return h;
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
|
547
|
+
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
|
548
|
+
|
|
753
549
|
#ifdef __cplusplus
|
|
754
550
|
}
|
|
755
551
|
#endif
|