@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
|
@@ -111,14 +111,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
111
111
|
function(check_arm_feature tag code)
|
|
112
112
|
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
|
113
113
|
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
|
|
114
|
-
check_cxx_source_runs(
|
|
115
|
-
"${code}"
|
|
116
|
-
GGML_MACHINE_SUPPORTS_${tag}
|
|
117
|
-
)
|
|
114
|
+
check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
|
|
118
115
|
if (GGML_MACHINE_SUPPORTS_${tag})
|
|
119
116
|
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
|
|
120
117
|
else()
|
|
121
|
-
set(
|
|
118
|
+
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
|
|
119
|
+
check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
|
|
120
|
+
if (GGML_MACHINE_SUPPORTS_no${tag})
|
|
121
|
+
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
|
|
122
|
+
endif()
|
|
122
123
|
endif()
|
|
123
124
|
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
|
124
125
|
endfunction()
|
|
@@ -126,6 +127,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
126
127
|
check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
|
|
127
128
|
check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
|
|
128
129
|
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
|
|
130
|
+
check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
|
|
129
131
|
|
|
130
132
|
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
|
|
131
133
|
else()
|
|
@@ -150,7 +152,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
150
152
|
if (ARM_FEATURE_RESULT)
|
|
151
153
|
message(WARNING "Failed to get ARM features")
|
|
152
154
|
else()
|
|
153
|
-
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
|
155
|
+
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
|
|
154
156
|
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
|
155
157
|
if (NOT ${feature_pos} EQUAL -1)
|
|
156
158
|
message(STATUS "ARM feature ${feature} enabled")
|
|
@@ -217,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
217
219
|
if (GGML_AVX_VNNI)
|
|
218
220
|
list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
|
|
219
221
|
endif()
|
|
222
|
+
if (GGML_BMI2)
|
|
223
|
+
# MSVC does not define macro __BMI2__
|
|
224
|
+
list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
|
|
225
|
+
endif()
|
|
220
226
|
else ()
|
|
221
227
|
if (GGML_NATIVE)
|
|
222
228
|
list(APPEND ARCH_FLAGS -march=native)
|
|
@@ -231,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
231
237
|
list(APPEND ARCH_FLAGS -mfma)
|
|
232
238
|
list(APPEND ARCH_DEFINITIONS GGML_FMA)
|
|
233
239
|
endif()
|
|
240
|
+
if (GGML_BMI2)
|
|
241
|
+
list(APPEND ARCH_FLAGS -mbmi2)
|
|
242
|
+
list(APPEND ARCH_DEFINITIONS GGML_BMI2)
|
|
243
|
+
endif()
|
|
234
244
|
if (GGML_AVX)
|
|
235
245
|
list(APPEND ARCH_FLAGS -mavx)
|
|
236
246
|
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
|
@@ -279,19 +289,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
279
289
|
endif()
|
|
280
290
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
|
281
291
|
message(STATUS "PowerPC detected")
|
|
282
|
-
execute_process(COMMAND bash -c "grep
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
if (${substring_index} GREATER_EQUAL 0)
|
|
289
|
-
list(APPEND ARCH_FLAGS -mcpu=power10)
|
|
292
|
+
execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
|
|
293
|
+
if (${POWER_M} MATCHES "POWER10")
|
|
294
|
+
list(APPEND ARCH_FLAGS -mcpu=power10)
|
|
295
|
+
elseif (${POWER_M} MATCHES "POWER9")
|
|
296
|
+
list(APPEND ARCH_FLAGS -mcpu=power9)
|
|
290
297
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
|
291
|
-
|
|
298
|
+
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
|
292
299
|
else()
|
|
293
|
-
list(APPEND ARCH_FLAGS -mcpu=
|
|
294
|
-
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
|
300
|
+
list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
|
|
295
301
|
endif()
|
|
296
302
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
|
297
303
|
message(STATUS "loongarch64 detected")
|
|
@@ -308,6 +314,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
308
314
|
if (GGML_RVV)
|
|
309
315
|
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
|
310
316
|
endif()
|
|
317
|
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
|
|
318
|
+
message(STATUS "s390x detected")
|
|
319
|
+
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
|
|
320
|
+
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
|
|
321
|
+
|
|
322
|
+
# TODO: Separation to determine activation of VX/VXE/VXE2
|
|
323
|
+
if (${S390X_M} MATCHES "8561|8562")
|
|
324
|
+
message(STATUS "z15 target")
|
|
325
|
+
list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
|
|
326
|
+
elseif (${S390X_M} MATCHES "3931")
|
|
327
|
+
message(STATUS "z16 target")
|
|
328
|
+
list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
|
|
329
|
+
else()
|
|
330
|
+
message(STATUS "Unknown target")
|
|
331
|
+
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
|
|
332
|
+
list(APPEND ARCH_FLAGS -march=native -mtune=native)
|
|
333
|
+
endif()
|
|
334
|
+
|
|
335
|
+
if (GGML_VXE)
|
|
336
|
+
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
|
337
|
+
endif()
|
|
311
338
|
else()
|
|
312
339
|
message(STATUS "Unknown architecture")
|
|
313
340
|
endif()
|
|
@@ -316,6 +343,94 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
316
343
|
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
|
|
317
344
|
endif()
|
|
318
345
|
|
|
346
|
+
if (GGML_CPU_KLEIDIAI)
|
|
347
|
+
message(STATUS "Using KleidiAI optimized kernels if applicable")
|
|
348
|
+
|
|
349
|
+
# Disable the KleidiAI tests
|
|
350
|
+
set(KLEIDIAI_BUILD_TESTS OFF)
|
|
351
|
+
|
|
352
|
+
# Fetch KleidiAI sources:
|
|
353
|
+
include(FetchContent)
|
|
354
|
+
set(KLEIDIAI_COMMIT_TAG "v1.3.0")
|
|
355
|
+
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
|
356
|
+
set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
|
|
357
|
+
|
|
358
|
+
if (POLICY CMP0135)
|
|
359
|
+
cmake_policy(SET CMP0135 NEW)
|
|
360
|
+
endif()
|
|
361
|
+
|
|
362
|
+
FetchContent_Declare(KleidiAI_Download
|
|
363
|
+
URL ${KLEIDIAI_DOWNLOAD_URL}
|
|
364
|
+
DOWNLOAD_EXTRACT_TIMESTAMP NEW
|
|
365
|
+
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
|
|
366
|
+
|
|
367
|
+
FetchContent_MakeAvailable(KleidiAI_Download)
|
|
368
|
+
FetchContent_GetProperties(KleidiAI_Download
|
|
369
|
+
SOURCE_DIR KLEIDIAI_SRC
|
|
370
|
+
POPULATED KLEIDIAI_POPULATED)
|
|
371
|
+
|
|
372
|
+
if (NOT KLEIDIAI_POPULATED)
|
|
373
|
+
message(FATAL_ERROR "KleidiAI source downloaded failed.")
|
|
374
|
+
endif()
|
|
375
|
+
|
|
376
|
+
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
|
|
377
|
+
|
|
378
|
+
# Exclude the fetched kleidiai target from the default (ALL) build
|
|
379
|
+
if (TARGET kleidiai)
|
|
380
|
+
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
|
381
|
+
endif()
|
|
382
|
+
|
|
383
|
+
list(APPEND GGML_CPU_SOURCES
|
|
384
|
+
ggml-cpu/kleidiai/kleidiai.cpp
|
|
385
|
+
ggml-cpu/kleidiai/kernels.cpp
|
|
386
|
+
ggml-cpu/kleidiai/kleidiai.h
|
|
387
|
+
ggml-cpu/kleidiai/kernels.h
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
# KleidiAI header search paths
|
|
391
|
+
include_directories(
|
|
392
|
+
${KLEIDIAI_SRC}/
|
|
393
|
+
${KLEIDIAI_SRC}/kai/
|
|
394
|
+
${KLEIDIAI_SRC}/kai/ukernels/
|
|
395
|
+
${KLEIDIAI_SRC}/kai/ukernels/matmul/
|
|
396
|
+
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
|
|
397
|
+
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
|
|
398
|
+
|
|
399
|
+
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
|
|
400
|
+
if (NOT ARCH_FLAGS_TEMP)
|
|
401
|
+
string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}")
|
|
402
|
+
endif()
|
|
403
|
+
string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
|
|
404
|
+
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
|
|
405
|
+
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
|
|
406
|
+
|
|
407
|
+
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
|
|
408
|
+
|
|
409
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
|
|
410
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
|
|
411
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
|
|
412
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
|
|
413
|
+
|
|
414
|
+
if (NOT DOTPROD_ENABLED MATCHES -1)
|
|
415
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
|
|
416
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
|
|
417
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
|
|
418
|
+
endif()
|
|
419
|
+
|
|
420
|
+
if (NOT I8MM_ENABLED MATCHES -1)
|
|
421
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
|
|
422
|
+
endif()
|
|
423
|
+
|
|
424
|
+
if (NOT SME_ENABLED MATCHES -1)
|
|
425
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
|
|
426
|
+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
|
|
427
|
+
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
|
428
|
+
endif()
|
|
429
|
+
|
|
430
|
+
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
|
|
431
|
+
list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
|
|
432
|
+
endif()
|
|
433
|
+
|
|
319
434
|
message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
|
|
320
435
|
target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
|
|
321
436
|
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
|
|
@@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
50
50
|
return (void *) (buffer->context);
|
|
51
51
|
}
|
|
52
52
|
|
|
53
|
-
static
|
|
53
|
+
static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
|
54
54
|
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
|
|
55
55
|
|
|
56
56
|
GGML_UNUSED(buffer);
|
|
57
|
+
return GGML_STATUS_SUCCESS;
|
|
57
58
|
}
|
|
58
59
|
|
|
59
60
|
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
|
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
|
|
|
278
278
|
if (!is.SSE42()) { return 0; }
|
|
279
279
|
score += 1<<2;
|
|
280
280
|
#endif
|
|
281
|
+
#ifdef GGML_BMI2
|
|
282
|
+
if (!is.BMI2()) { return 0; }
|
|
283
|
+
score += 1<<3;
|
|
284
|
+
#endif
|
|
281
285
|
#ifdef GGML_AVX
|
|
282
286
|
if (!is.AVX()) { return 0; }
|
|
283
287
|
score += 1<<4;
|
|
@@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
|
|
|
4135
4135
|
return nullptr;
|
|
4136
4136
|
}
|
|
4137
4137
|
|
|
4138
|
-
static
|
|
4138
|
+
static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
|
4139
4139
|
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
|
|
4140
4140
|
|
|
4141
4141
|
GGML_UNUSED(buffer);
|
|
4142
|
+
return GGML_STATUS_SUCCESS;
|
|
4142
4143
|
}
|
|
4143
4144
|
|
|
4144
4145
|
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
|
@@ -59,6 +59,15 @@ struct ggml_compute_params {
|
|
|
59
59
|
#endif
|
|
60
60
|
#endif
|
|
61
61
|
|
|
62
|
+
#if defined(__s390x__) && defined(__VEC__)
|
|
63
|
+
#ifndef __VXE__
|
|
64
|
+
#define __VXE__
|
|
65
|
+
#endif
|
|
66
|
+
#ifndef __VXE2__
|
|
67
|
+
#define __VXE2__
|
|
68
|
+
#endif
|
|
69
|
+
#endif
|
|
70
|
+
|
|
62
71
|
#if defined(__ARM_FEATURE_SVE)
|
|
63
72
|
#include <arm_sve.h>
|
|
64
73
|
#include <sys/prctl.h>
|
|
@@ -359,22 +368,158 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
|
|
359
368
|
#endif
|
|
360
369
|
#endif
|
|
361
370
|
|
|
362
|
-
#if defined(
|
|
371
|
+
#if defined(__VXE__) || defined(__VXE2__)
|
|
372
|
+
#include <vecintrin.h>
|
|
373
|
+
|
|
374
|
+
#define vec_neg(a) (-(a)) // Vector Negate
|
|
375
|
+
#define vec_add(a, b) ((a) + (b)) // Vector Add
|
|
376
|
+
#define vec_sub(a, b) ((a) - (b)) // Vector Subtract
|
|
377
|
+
#define vec_mul(a, b) ((a) * (b)) // Vector Multiply
|
|
378
|
+
#define vec_div(a, b) ((a) / (b)) // Vector Divide
|
|
379
|
+
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
|
|
380
|
+
#define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
|
|
381
|
+
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right
|
|
382
|
+
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
|
|
383
|
+
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
|
|
384
|
+
|
|
385
|
+
#ifndef vec_and
|
|
386
|
+
#define vec_and(a, b) ((a) & (b)) // Vector AND
|
|
387
|
+
#endif
|
|
388
|
+
|
|
389
|
+
#ifndef vec_or
|
|
390
|
+
#define vec_or(a, b) ((a) | (b)) // Vector OR
|
|
391
|
+
#endif
|
|
392
|
+
|
|
393
|
+
#ifndef vec_xor
|
|
394
|
+
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
|
|
395
|
+
#endif
|
|
396
|
+
|
|
397
|
+
typedef signed char char8x16_t __attribute__((vector_size(16)));
|
|
398
|
+
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
|
|
399
|
+
|
|
400
|
+
typedef int8_t int8x16_t __attribute__((vector_size(16)));
|
|
401
|
+
typedef int16_t int16x8_t __attribute__((vector_size(16)));
|
|
402
|
+
typedef int32_t int32x4_t __attribute__((vector_size(16)));
|
|
403
|
+
|
|
404
|
+
typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
|
|
405
|
+
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
|
|
406
|
+
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
|
|
407
|
+
|
|
408
|
+
typedef float float32x4_t __attribute__((vector_size(16)));
|
|
409
|
+
typedef double double64x2_t __attribute((vector_size(16)));
|
|
410
|
+
|
|
411
|
+
typedef signed long long long64x2_t __attribute((vector_size(16)));
|
|
412
|
+
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
|
|
413
|
+
|
|
414
|
+
typedef struct ggml_uint8x16x2_t {
|
|
415
|
+
uint8x16_t val[2];
|
|
416
|
+
} ggml_uint8x16x2_t;
|
|
417
|
+
|
|
418
|
+
inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
|
|
419
|
+
ggml_uint8x16x2_t res;
|
|
420
|
+
|
|
421
|
+
res.val[0] = vec_xl( 0, ptr);
|
|
422
|
+
res.val[1] = vec_xl(16, ptr);
|
|
423
|
+
|
|
424
|
+
return res;
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
typedef struct ggml_uint8x16x4_t {
|
|
428
|
+
uint8x16_t val[4];
|
|
429
|
+
} ggml_uint8x16x4_t;
|
|
430
|
+
|
|
431
|
+
inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
|
|
432
|
+
ggml_uint8x16x4_t res;
|
|
433
|
+
|
|
434
|
+
res.val[0] = vec_xl( 0, ptr);
|
|
435
|
+
res.val[1] = vec_xl(16, ptr);
|
|
436
|
+
res.val[2] = vec_xl(32, ptr);
|
|
437
|
+
res.val[3] = vec_xl(48, ptr);
|
|
438
|
+
|
|
439
|
+
return res;
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
typedef struct ggml_int8x16x4_t {
|
|
443
|
+
int8x16_t val[4];
|
|
444
|
+
} ggml_int8x16x4_t;
|
|
445
|
+
|
|
446
|
+
inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
|
|
447
|
+
ggml_int8x16x4_t res;
|
|
448
|
+
|
|
449
|
+
res.val[0] = vec_xl( 0, ptr);
|
|
450
|
+
res.val[1] = vec_xl(16, ptr);
|
|
451
|
+
res.val[2] = vec_xl(32, ptr);
|
|
452
|
+
res.val[3] = vec_xl(48, ptr);
|
|
363
453
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
454
|
+
return res;
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
typedef struct ggml_int16x8x2_t {
|
|
458
|
+
int16x8_t val[2];
|
|
459
|
+
} ggml_int16x8x2_t;
|
|
460
|
+
|
|
461
|
+
inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
|
|
462
|
+
ggml_int16x8x2_t res;
|
|
463
|
+
|
|
464
|
+
res.val[0] = vec_xl( 0, ptr);
|
|
465
|
+
res.val[1] = vec_xl(16, ptr);
|
|
466
|
+
|
|
467
|
+
return res;
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
/*
|
|
471
|
+
! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
|
|
472
|
+
! or iq4_nl for example implementation.
|
|
473
|
+
*/
|
|
474
|
+
inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
|
|
475
|
+
int8x16_t res;
|
|
476
|
+
|
|
477
|
+
res[ 0] = a[b[ 0]];
|
|
478
|
+
res[ 1] = a[b[ 1]];
|
|
479
|
+
res[ 2] = a[b[ 2]];
|
|
480
|
+
res[ 3] = a[b[ 3]];
|
|
481
|
+
res[ 4] = a[b[ 4]];
|
|
482
|
+
res[ 5] = a[b[ 5]];
|
|
483
|
+
res[ 6] = a[b[ 6]];
|
|
484
|
+
res[ 7] = a[b[ 7]];
|
|
485
|
+
res[ 8] = a[b[ 8]];
|
|
486
|
+
res[ 9] = a[b[ 9]];
|
|
487
|
+
res[10] = a[b[10]];
|
|
488
|
+
res[11] = a[b[11]];
|
|
489
|
+
res[12] = a[b[12]];
|
|
490
|
+
res[13] = a[b[13]];
|
|
491
|
+
res[14] = a[b[14]];
|
|
492
|
+
res[15] = a[b[15]];
|
|
368
493
|
|
|
494
|
+
return res;
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
|
|
498
|
+
const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
|
|
499
|
+
16, 17, 20, 21, 24, 25, 28, 29 };
|
|
500
|
+
|
|
501
|
+
const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
|
|
502
|
+
const int16x8_t v_abe = vec_perm(a, b, v_maske);
|
|
503
|
+
return v_abo + v_abe;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
|
507
|
+
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
|
|
508
|
+
return acc + (vec_unpackh(p) + vec_unpackl(p));
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
#endif
|
|
512
|
+
|
|
513
|
+
#if defined(__loongarch_asx)
|
|
369
514
|
/* float type data load instructions */
|
|
370
|
-
static __m128 __lsx_vreplfr2vr_s(float val) {
|
|
371
|
-
|
|
372
|
-
return (__m128)
|
|
515
|
+
static __m128 __lsx_vreplfr2vr_s(const float val) {
|
|
516
|
+
v4f32 res = {val, val, val, val};
|
|
517
|
+
return (__m128)res;
|
|
373
518
|
}
|
|
374
519
|
|
|
375
|
-
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
|
376
|
-
|
|
377
|
-
return (__m256)
|
|
520
|
+
static __m256 __lasx_xvreplfr2vr_s(const float val) {
|
|
521
|
+
v8f32 res = {val, val, val, val, val, val, val, val};
|
|
522
|
+
return (__m256)res;
|
|
378
523
|
}
|
|
379
524
|
#endif
|
|
380
525
|
|