@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/ggml/src/ggml-common.h:

```diff
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
 
 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
 
 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
 
 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
```
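
The `GGML_EXTENSION` macro introduced here exists because these quantization block structs put an anonymous struct inside a union, a construct that GCC and Clang flag under `-Wpedantic`; prefixing the union with `__extension__` should silence that diagnostic, while MSVC has no such keyword and gets an empty expansion. A minimal sketch of the pattern (the `demo_block` type and the `uint32_t` stand-ins are illustrative, not from the package):

```cpp
// Sketch only: mirrors the block layout pattern from ggml-common.h.
// Compile with: g++ -std=c++17 -Wall -Wpedantic extension_demo.cpp
#include <cstdint>
#include <cstdio>

#ifdef _MSC_VER
#define GGML_EXTENSION
#else // _MSC_VER
#define GGML_EXTENSION __extension__
#endif // _MSC_VER

typedef uint16_t ggml_half; // stand-in for the real half-precision type

typedef struct {
    GGML_EXTENSION union { // without __extension__, -Wpedantic warns about the anonymous struct
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        };
        uint32_t dm; // the two scales viewed as one 32-bit word
    };
    uint8_t qs[16]; // quants
} demo_block;

int main() {
    demo_block b = {};
    b.d = 1; // members of the anonymous struct are accessed directly on the block
    b.m = 2;
    std::printf("sizeof(demo_block) = %zu\n", sizeof(demo_block));
    return 0;
}
```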
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt:

```diff
@@ -23,6 +23,16 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml-cpu/amx/mmq.cpp
         ggml-cpu/amx/mmq.h
         ggml-cpu/ggml-cpu-impl.h
+        ggml-cpu/common.h
+        ggml-cpu/binary-ops.h
+        ggml-cpu/binary-ops.cpp
+        ggml-cpu/unary-ops.h
+        ggml-cpu/unary-ops.cpp
+        ggml-cpu/simd-mappings.h
+        ggml-cpu/vec.h
+        ggml-cpu/vec.cpp
+        ggml-cpu/ops.h
+        ggml-cpu/ops.cpp
         )
 
     target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
@@ -212,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         elseif (GGML_AVX)
             list(APPEND ARCH_FLAGS /arch:AVX)
             list(APPEND ARCH_DEFINITIONS GGML_AVX)
-        else ()
+        elseif (GGML_SSE42)
             list(APPEND ARCH_FLAGS /arch:SSE4.2)
             list(APPEND ARCH_DEFINITIONS GGML_SSE42)
         endif()
@@ -227,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         if (GGML_NATIVE)
             list(APPEND ARCH_FLAGS -march=native)
         else ()
-            list(APPEND ARCH_FLAGS -msse4.2)
-            list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            if (GGML_SSE42)
+                list(APPEND ARCH_FLAGS -msse4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            endif()
             if (GGML_F16C)
                 list(APPEND ARCH_FLAGS -mf16c)
                 list(APPEND ARCH_DEFINITIONS GGML_F16C)
@@ -289,23 +301,29 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
     elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
         message(STATUS "PowerPC detected")
-        if(
-
-
-
-
+        if (GGML_NATIVE)
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+                file(READ "/proc/cpuinfo" POWER10_M)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
+            endif()
 
-
-
+            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
 
-
-
-
-
-
-
+            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+            elseif (EXTRACTED_NUMBER EQUAL 9)
+                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+            else()
+                list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
+            endif()
         else()
-
+            if (GGML_CPU_POWERPC_CPUTYPE)
+                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
+            endif()
         endif()
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         message(STATUS "loongarch64 detected")
@@ -320,7 +338,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
         message(STATUS "RISC-V detected")
         if (GGML_RVV)
-            list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            if (GGML_RV_ZFH)
+                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
+            else()
+                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            endif()
         endif()
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
         message(STATUS "s390x detected")
@@ -330,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
             message(STATUS "z15 target")
-            list(APPEND ARCH_FLAGS -march=z15
+            list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
             message(STATUS "z16 target")
-            list(APPEND ARCH_FLAGS -march=z16
+            list(APPEND ARCH_FLAGS -march=z16)
+        elseif (${S390X_M} MATCHES "9175|9176")
+            # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+            message(STATUS "z17 target")
+            list(APPEND ARCH_FLAGS -march=z17)
         else()
             message(STATUS "Unknown target")
             message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@@ -359,9 +385,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.
+        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "
+        set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
```
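
The `ARCH_DEFINITIONS` entries collected in this file become preprocessor defines for each CPU-backend build variant, so source code can gate instruction-set-specific paths at compile time. A hypothetical sketch of that mechanism (the function and its body are illustrative, not part of the package; only the `GGML_SSE42` define comes from the diff above):

```cpp
// Sketch: how a per-variant define such as GGML_SSE42 typically selects
// an ISA-specific code path. Built with the matching -msse4.2 flag, the
// hardware branch compiles; without the define, the portable one does.
#include <cstddef>
#include <cstdint>
#include <cstdio>

#if defined(GGML_SSE42)
#include <nmmintrin.h> // SSE4.2 intrinsics
#endif

// CRC-32C of a buffer: one CRC32 instruction per byte on the SSE4.2
// path, bitwise fallback otherwise. Both produce the same result.
static uint32_t demo_crc32c(const uint8_t * data, size_t n) {
    uint32_t crc = 0xffffffffu;
#if defined(GGML_SSE42)
    for (size_t i = 0; i < n; ++i) {
        crc = _mm_crc32_u8(crc, data[i]);
    }
#else
    for (size_t i = 0; i < n; ++i) {
        crc ^= data[i];
        for (int k = 0; k < 8; ++k) {
            crc = (crc >> 1) ^ (0x82f63b78u & (0u - (crc & 1u)));
        }
    }
#endif
    return ~crc;
}

int main() {
    const uint8_t msg[] = {'g', 'g', 'm', 'l'};
    std::printf("crc32c = 0x%08x\n", demo_crc32c(msg, sizeof(msg)));
    return 0;
}
```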
package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp (new file):

```diff
@@ -0,0 +1,158 @@
+#include "binary-ops.h"
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+
+using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
+#endif
+
+static inline float op_add(float a, float b) {
+    return a + b;
+}
+
+static inline float op_sub(float a, float b) {
+    return a - b;
+}
+
+static inline float op_mul(float a, float b) {
+    return a * b;
+}
+
+static inline float op_div(float a, float b) {
+    return a / b;
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        int i10 = i % ne10;
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+#ifdef GGML_USE_ACCELERATE
+    vDSP_fn_t vDSP_op = nullptr;
+    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (op == op_add) {
+            vDSP_op = vDSP_vadd;
+        } else if (op == op_sub) {
+            vDSP_op = vDSP_vsub;
+        } else if (op == op_mul) {
+            vDSP_op = vDSP_vmul;
+        } else if (op == op_div) {
+            vDSP_op = vDSP_vdiv;
+        }
+    }
+#endif
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const int64_t i13 = i03 % ne13;
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t        *) ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t nr0 = ne00 / ne10;
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
+                    if (vDSP_op != nullptr) {
+                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+                        continue;
+                    }
+                }
+#endif
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        } else {
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
+        }
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)>
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    /* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+        apply_binary_op<op, float, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
+        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+    } else {
+        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_add>(params, dst);
+}
+
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_sub>(params, dst);
+}
+
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_mul>(params, dst);
+}
+
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_div>(params, dst);
+}
```
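
The new binary-ops.cpp funnels add/sub/mul/div through a single templated kernel: the scalar op is a compile-time function-pointer parameter, and each supported storage type (f32, f16, bf16) is converted through float via the `type_conversion_table` specializations in the new ggml-cpu/common.h shown below. A self-contained miniature of that dispatch pattern (toy types and names, no ggml dependencies):

```cpp
// Stand-alone sketch of the binary-ops dispatch pattern: the element-wise
// op is a template parameter, and per-type conversion function pointers
// map storage types through float. "toy_half" is an illustrative stand-in.
#include <cstdint>
#include <cstdio>

// toy storage type: a float rounded to 1/256 steps
struct toy_half { int16_t bits; };

static inline float    toy_half_to_f32(toy_half x) { return x.bits / 256.0f; }
static inline toy_half f32_to_toy_half(float x)    { return { (int16_t)(x * 256.0f) }; }
static inline float    f32_to_f32_id(float x)      { return x; }

template <class T> struct conv;
template <> struct conv<float> {
    static constexpr float (*to_f32)(float)   = f32_to_f32_id; // identity
    static constexpr float (*from_f32)(float) = f32_to_f32_id;
};
template <> struct conv<toy_half> {
    static constexpr float    (*to_f32)(toy_half) = toy_half_to_f32;
    static constexpr toy_half (*from_f32)(float)  = f32_to_toy_half;
};

static inline float op_add(float a, float b) { return a + b; }
static inline float op_mul(float a, float b) { return a * b; }

// one kernel serves every (op, src0, src1, dst) combination
template <float (*op)(float, float), class src0_t, class src1_t, class dst_t>
static void vec_op(int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
    for (int64_t i = 0; i < n; i++) {
        z[i] = conv<dst_t>::from_f32(op(conv<src0_t>::to_f32(x[i]),
                                        conv<src1_t>::to_f32(y[i])));
    }
}

int main() {
    float    x[4] = {1, 2, 3, 4};
    toy_half y[4] = {{256}, {512}, {768}, {1024}}; // 1, 2, 3, 4
    float    z[4];

    vec_op<op_add>(4, z, x, y); // mixed f32 + toy_half -> f32
    for (float v : z) std::printf("%g ", v);
    std::printf("\n"); // prints: 2 4 6 8

    vec_op<op_mul>(4, z, x, x); // all-f32 path uses the identity conversions
    for (float v : z) std::printf("%g ", v);
    std::printf("\n"); // prints: 1 4 9 16
    return 0;
}
```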
package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h (new file):

```diff
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
```
package/src/llama.cpp/ggml/src/ggml-cpu/common.h (new file):

```diff
@@ -0,0 +1,72 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+static inline ggml_fp16_t f32_to_f16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
+static inline float f16_to_f32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+static inline ggml_bf16_t f32_to_bf16(float x) {
+    return GGML_FP32_TO_BF16(x);
+}
+
+static inline float bf16_to_f32(ggml_bf16_t x) {
+    return GGML_BF16_TO_FP32(x);
+}
+
+static inline float f32_to_f32(float x) {
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+template <class T>
+struct type_conversion_table;
+
+template <>
+struct type_conversion_table<ggml_fp16_t> {
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+
+template <>
+struct type_conversion_table<float> {
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+template <>
+struct type_conversion_table<ggml_bf16_t> {
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    return {ir0, ir1};
+}
+
+#endif
```
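
`get_thread_range` above gives worker thread `ith` of `nth` the half-open row slice `[ir0, ir1)`, with `dr = ceil(nr/nth)` rows per thread, so the slices cover every row exactly once even when `nr` is not divisible by `nth`. A standalone sketch of the same arithmetic (toy driver in place of `ggml_compute_params`):

```cpp
// Stand-alone sketch of the row partitioning used by get_thread_range.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<int64_t, int64_t> thread_range(int64_t ith, int64_t nth, int64_t nr) {
    const int64_t dr  = (nr + nth - 1)/nth;     // rows per thread, rounded up
    const int64_t ir0 = dr*ith;                 // first row of this thread
    const int64_t ir1 = std::min(ir0 + dr, nr); // one past the last row
    return {ir0, ir1};
}

int main() {
    const int64_t nr = 10, nth = 4; // 10 rows over 4 threads
    for (int64_t ith = 0; ith < nth; ++ith) {
        auto [ir0, ir1] = thread_range(ith, nth, nr);
        std::printf("thread %lld: rows [%lld, %lld)\n",
                    (long long) ith, (long long) ir0, (long long) ir1);
    }
    // prints the slices [0,3) [3,6) [6,9) [9,10)
    return 0;
}
```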