@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -111,14 +111,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
111
111
  function(check_arm_feature tag code)
112
112
  set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
113
113
  set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
114
- check_cxx_source_runs(
115
- "${code}"
116
- GGML_MACHINE_SUPPORTS_${tag}
117
- )
114
+ check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
118
115
  if (GGML_MACHINE_SUPPORTS_${tag})
119
116
  set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
120
117
  else()
121
- set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
118
+ set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
119
+ check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
120
+ if (GGML_MACHINE_SUPPORTS_no${tag})
121
+ set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
122
+ endif()
122
123
  endif()
123
124
  set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
124
125
  endfunction()
@@ -126,6 +127,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
126
127
  check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
127
128
  check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
128
129
  check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
130
+ check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
129
131
 
130
132
  list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
131
133
  else()
@@ -150,7 +152,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
150
152
  if (ARM_FEATURE_RESULT)
151
153
  message(WARNING "Failed to get ARM features")
152
154
  else()
153
- foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
155
+ foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
154
156
  string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
155
157
  if (NOT ${feature_pos} EQUAL -1)
156
158
  message(STATUS "ARM feature ${feature} enabled")
@@ -217,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
217
219
  if (GGML_AVX_VNNI)
218
220
  list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
219
221
  endif()
222
+ if (GGML_BMI2)
223
+ # MSVC does not define macro __BMI2__
224
+ list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
225
+ endif()
220
226
  else ()
221
227
  if (GGML_NATIVE)
222
228
  list(APPEND ARCH_FLAGS -march=native)
@@ -231,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
231
237
  list(APPEND ARCH_FLAGS -mfma)
232
238
  list(APPEND ARCH_DEFINITIONS GGML_FMA)
233
239
  endif()
240
+ if (GGML_BMI2)
241
+ list(APPEND ARCH_FLAGS -mbmi2)
242
+ list(APPEND ARCH_DEFINITIONS GGML_BMI2)
243
+ endif()
234
244
  if (GGML_AVX)
235
245
  list(APPEND ARCH_FLAGS -mavx)
236
246
  list(APPEND ARCH_DEFINITIONS GGML_AVX)
@@ -279,19 +289,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
279
289
  endif()
280
290
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
281
291
  message(STATUS "PowerPC detected")
282
- execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
283
- string(FIND "${POWER10_M}" "POWER10" substring_index)
284
- if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
285
- set(substring_index -1)
286
- endif()
287
-
288
- if (${substring_index} GREATER_EQUAL 0)
289
- list(APPEND ARCH_FLAGS -mcpu=power10)
292
+ execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
293
+ if (${POWER_M} MATCHES "POWER10")
294
+ list(APPEND ARCH_FLAGS -mcpu=power10)
295
+ elseif (${POWER_M} MATCHES "POWER9")
296
+ list(APPEND ARCH_FLAGS -mcpu=power9)
290
297
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
291
- list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
298
+ list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
292
299
  else()
293
- list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
294
- # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
300
+ list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
295
301
  endif()
296
302
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
297
303
  message(STATUS "loongarch64 detected")
@@ -308,6 +314,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
308
314
  if (GGML_RVV)
309
315
  list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
310
316
  endif()
317
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
318
+ message(STATUS "s390x detected")
319
+ file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
320
+ string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
321
+
322
+ # TODO: Separation to determine activation of VX/VXE/VXE2
323
+ if (${S390X_M} MATCHES "8561|8562")
324
+ message(STATUS "z15 target")
325
+ list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
326
+ elseif (${S390X_M} MATCHES "3931")
327
+ message(STATUS "z16 target")
328
+ list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
329
+ else()
330
+ message(STATUS "Unknown target")
331
+ message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
332
+ list(APPEND ARCH_FLAGS -march=native -mtune=native)
333
+ endif()
334
+
335
+ if (GGML_VXE)
336
+ list(APPEND ARCH_FLAGS -mvx -mzvector)
337
+ endif()
311
338
  else()
312
339
  message(STATUS "Unknown architecture")
313
340
  endif()
@@ -316,6 +343,94 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
316
343
  target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
317
344
  endif()
318
345
 
346
+ if (GGML_CPU_KLEIDIAI)
347
+ message(STATUS "Using KleidiAI optimized kernels if applicable")
348
+
349
+ # Disable the KleidiAI tests
350
+ set(KLEIDIAI_BUILD_TESTS OFF)
351
+
352
+ # Fetch KleidiAI sources:
353
+ include(FetchContent)
354
+ set(KLEIDIAI_COMMIT_TAG "v1.3.0")
355
+ set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
356
+ set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
357
+
358
+ if (POLICY CMP0135)
359
+ cmake_policy(SET CMP0135 NEW)
360
+ endif()
361
+
362
+ FetchContent_Declare(KleidiAI_Download
363
+ URL ${KLEIDIAI_DOWNLOAD_URL}
364
+ DOWNLOAD_EXTRACT_TIMESTAMP NEW
365
+ URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
366
+
367
+ FetchContent_MakeAvailable(KleidiAI_Download)
368
+ FetchContent_GetProperties(KleidiAI_Download
369
+ SOURCE_DIR KLEIDIAI_SRC
370
+ POPULATED KLEIDIAI_POPULATED)
371
+
372
+ if (NOT KLEIDIAI_POPULATED)
373
+ message(FATAL_ERROR "KleidiAI source downloaded failed.")
374
+ endif()
375
+
376
+ add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
377
+
378
+ # Remove kleidiai target after fetching it
379
+ if (TARGET kleidiai)
380
+ set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
381
+ endif()
382
+
383
+ list(APPEND GGML_CPU_SOURCES
384
+ ggml-cpu/kleidiai/kleidiai.cpp
385
+ ggml-cpu/kleidiai/kernels.cpp
386
+ ggml-cpu/kleidiai/kleidiai.h
387
+ ggml-cpu/kleidiai/kernels.h
388
+ )
389
+
390
+ # KleidiAI
391
+ include_directories(
392
+ ${KLEIDIAI_SRC}/
393
+ ${KLEIDIAI_SRC}/kai/
394
+ ${KLEIDIAI_SRC}/kai/ukernels/
395
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/
396
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
397
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
398
+
399
+ set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
400
+ if (NOT ARCH_FLAGS_TEMP)
401
+ string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}")
402
+ endif()
403
+ string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
404
+ string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
405
+ string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
406
+
407
+ set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
408
+
409
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
410
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
411
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
412
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
413
+
414
+ if (NOT DOTPROD_ENABLED MATCHES -1)
415
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
416
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
417
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
418
+ endif()
419
+
420
+ if (NOT I8MM_ENABLED MATCHES -1)
421
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
422
+ endif()
423
+
424
+ if (NOT SME_ENABLED MATCHES -1)
425
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
426
+ list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
427
+ set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
428
+ endif()
429
+
430
+ set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
431
+ list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
432
+ endif()
433
+
319
434
  message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
320
435
  target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
321
436
  target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
@@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
50
50
  return (void *) (buffer->context);
51
51
  }
52
52
 
53
- static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
53
+ static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
54
54
  tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
55
55
 
56
56
  GGML_UNUSED(buffer);
57
+ return GGML_STATUS_SUCCESS;
57
58
  }
58
59
 
59
60
  static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
278
278
  if (!is.SSE42()) { return 0; }
279
279
  score += 1<<2;
280
280
  #endif
281
+ #ifdef GGML_BMI2
282
+ if (!is.BMI2()) { return 0; }
283
+ score += 1<<3;
284
+ #endif
281
285
  #ifdef GGML_AVX
282
286
  if (!is.AVX()) { return 0; }
283
287
  score += 1<<4;
@@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
4135
4135
  return nullptr;
4136
4136
  }
4137
4137
 
4138
- static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
4138
+ static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
4139
4139
  tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
4140
4140
 
4141
4141
  GGML_UNUSED(buffer);
4142
+ return GGML_STATUS_SUCCESS;
4142
4143
  }
4143
4144
 
4144
4145
  static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
@@ -59,6 +59,15 @@ struct ggml_compute_params {
59
59
  #endif
60
60
  #endif
61
61
 
62
+ #if defined(__s390x__) && defined(__VEC__)
63
+ #ifndef __VXE__
64
+ #define __VXE__
65
+ #endif
66
+ #ifndef __VXE2__
67
+ #define __VXE2__
68
+ #endif
69
+ #endif
70
+
62
71
  #if defined(__ARM_FEATURE_SVE)
63
72
  #include <arm_sve.h>
64
73
  #include <sys/prctl.h>
@@ -359,6 +368,148 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
359
368
  #endif
360
369
  #endif
361
370
 
371
+ #if defined(__VXE__) || defined(__VXE2__)
372
+ #include <vecintrin.h>
373
+
374
+ #define vec_neg(a) (-(a)) // Vector Negate
375
+ #define vec_add(a, b) ((a) + (b)) // Vector Add
376
+ #define vec_sub(a, b) ((a) - (b)) // Vector Subtract
377
+ #define vec_mul(a, b) ((a) * (b)) // Vector Multiply
378
+ #define vec_div(a, b) ((a) / (b)) // Vector Divide
379
+ #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
380
+ #define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right
381
+ #define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
382
+ #define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
383
+ #define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
384
+
385
+ #ifndef vec_and
386
+ #define vec_and(a, b) ((a) & (b)) // Vector AND
387
+ #endif
388
+
389
+ #ifndef vec_or
390
+ #define vec_or(a, b) ((a) | (b)) // Vector OR
391
+ #endif
392
+
393
+ #ifndef vec_xor
394
+ #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
395
+ #endif
396
+
397
+ typedef signed char char8x16_t __attribute__((vector_size(16)));
398
+ typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
399
+
400
+ typedef int8_t int8x16_t __attribute__((vector_size(16)));
401
+ typedef int16_t int16x8_t __attribute__((vector_size(16)));
402
+ typedef int32_t int32x4_t __attribute__((vector_size(16)));
403
+
404
+ typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
405
+ typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
406
+ typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
407
+
408
+ typedef float float32x4_t __attribute__((vector_size(16)));
409
+ typedef double double64x2_t __attribute((vector_size(16)));
410
+
411
+ typedef signed long long long64x2_t __attribute((vector_size(16)));
412
+ typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
413
+
414
+ typedef struct ggml_uint8x16x2_t {
415
+ uint8x16_t val[2];
416
+ } ggml_uint8x16x2_t;
417
+
418
+ inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
419
+ ggml_uint8x16x2_t res;
420
+
421
+ res.val[0] = vec_xl( 0, ptr);
422
+ res.val[1] = vec_xl(16, ptr);
423
+
424
+ return res;
425
+ }
426
+
427
+ typedef struct ggml_uint8x16x4_t {
428
+ uint8x16_t val[4];
429
+ } ggml_uint8x16x4_t;
430
+
431
+ inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
432
+ ggml_uint8x16x4_t res;
433
+
434
+ res.val[0] = vec_xl( 0, ptr);
435
+ res.val[1] = vec_xl(16, ptr);
436
+ res.val[2] = vec_xl(32, ptr);
437
+ res.val[3] = vec_xl(48, ptr);
438
+
439
+ return res;
440
+ }
441
+
442
+ typedef struct ggml_int8x16x4_t {
443
+ int8x16_t val[4];
444
+ } ggml_int8x16x4_t;
445
+
446
+ inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
447
+ ggml_int8x16x4_t res;
448
+
449
+ res.val[0] = vec_xl( 0, ptr);
450
+ res.val[1] = vec_xl(16, ptr);
451
+ res.val[2] = vec_xl(32, ptr);
452
+ res.val[3] = vec_xl(48, ptr);
453
+
454
+ return res;
455
+ }
456
+
457
+ typedef struct ggml_int16x8x2_t {
458
+ int16x8_t val[2];
459
+ } ggml_int16x8x2_t;
460
+
461
+ inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
462
+ ggml_int16x8x2_t res;
463
+
464
+ res.val[0] = vec_xl( 0, ptr);
465
+ res.val[1] = vec_xl(16, ptr);
466
+
467
+ return res;
468
+ }
469
+
470
+ /*
471
+ ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
472
+ ! or iq4_nl for example implementation.
473
+ */
474
+ inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
475
+ int8x16_t res;
476
+
477
+ res[ 0] = a[b[ 0]];
478
+ res[ 1] = a[b[ 1]];
479
+ res[ 2] = a[b[ 2]];
480
+ res[ 3] = a[b[ 3]];
481
+ res[ 4] = a[b[ 4]];
482
+ res[ 5] = a[b[ 5]];
483
+ res[ 6] = a[b[ 6]];
484
+ res[ 7] = a[b[ 7]];
485
+ res[ 8] = a[b[ 8]];
486
+ res[ 9] = a[b[ 9]];
487
+ res[10] = a[b[10]];
488
+ res[11] = a[b[11]];
489
+ res[12] = a[b[12]];
490
+ res[13] = a[b[13]];
491
+ res[14] = a[b[14]];
492
+ res[15] = a[b[15]];
493
+
494
+ return res;
495
+ }
496
+
497
+ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
498
+ const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
499
+ 16, 17, 20, 21, 24, 25, 28, 29 };
500
+
501
+ const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
502
+ const int16x8_t v_abe = vec_perm(a, b, v_maske);
503
+ return v_abo + v_abe;
504
+ }
505
+
506
+ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
507
+ const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
508
+ return acc + (vec_unpackh(p) + vec_unpackl(p));
509
+ }
510
+
511
+ #endif
512
+
362
513
  #if defined(__loongarch_asx)
363
514
  /* float type data load instructions */
364
515
  static __m128 __lsx_vreplfr2vr_s(const float val) {