@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -20,6 +20,7 @@
20
20
  #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
21
21
  #define CUBLAS_TF32_TENSOR_OP_MATH 0
22
22
  #define CUDA_R_16F HIPBLAS_R_16F
23
+ #define CUDA_R_16BF HIPBLAS_R_16B
23
24
  #define CUDA_R_32F HIPBLAS_R_32F
24
25
  #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
25
26
  #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
@@ -70,6 +71,8 @@
70
71
  #define cudaLaunchHostFunc hipLaunchHostFunc
71
72
  #define cudaMalloc hipMalloc
72
73
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
74
+ #define cudaMallocManaged hipMallocManaged
75
+ #define cudaMemAdvise hipMemAdvise
73
76
  #define cudaMemcpy hipMemcpy
74
77
  #define cudaMemcpyAsync hipMemcpyAsync
75
78
  #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
@@ -151,6 +154,10 @@
151
154
  #define CDNA
152
155
  #endif
153
156
 
157
+ #if defined(__GFX12__)
158
+ #define RDNA4
159
+ #endif
160
+
154
161
  #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
155
162
  defined(__gfx1150__) || defined(__gfx1151__)
156
163
  #define RDNA3
@@ -15,6 +15,7 @@
15
15
  #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
16
16
  #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
17
17
  #define CUDA_R_16F MUSA_R_16F
18
+ #define CUDA_R_16BF MUSA_R_16BF
18
19
  #define CUDA_R_32F MUSA_R_32F
19
20
  #define cublasComputeType_t cudaDataType_t
20
21
  #define cublasCreate mublasCreate
@@ -89,10 +89,6 @@ endif()
89
89
 
90
90
  add_compile_definitions(GGML_USE_HIP)
91
91
 
92
- if (GGML_HIP_UMA)
93
- add_compile_definitions(GGML_HIP_UMA)
94
- endif()
95
-
96
92
  if (GGML_CUDA_FORCE_MMQ)
97
93
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
98
94
  endif()
@@ -148,8 +148,14 @@ struct ggml_map_custom2_op_params {
148
148
 
149
149
  struct ggml_map_custom3_op_params {
150
150
  ggml_custom3_op_t fun;
151
- int n_tasks;
152
- void * userdata;
151
+ int n_tasks;
152
+ void * userdata;
153
+ };
154
+
155
+ struct ggml_custom_op_params {
156
+ ggml_custom_op_t fun;
157
+ int n_tasks;
158
+ void * userdata;
153
159
  };
154
160
 
155
161
  // bitset
@@ -311,29 +317,28 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
311
317
 
312
318
  // FP16 to FP32 conversion
313
319
 
314
- #if defined(__ARM_NEON)
315
- #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
316
- typedef uint16_t ggml_fp16_internal_t;
317
- #else
318
- typedef __fp16 ggml_fp16_internal_t;
319
- #endif
320
- #endif
321
-
322
- #if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
320
+ // 16-bit float
321
+ // on Arm, we use __fp16
322
+ // on x86, we use uint16_t
323
+ //
324
+ // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
325
+ // for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
326
+ //
327
+ #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
323
328
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
324
329
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
325
330
 
326
331
  #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
327
332
 
328
333
  static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
329
- ggml_fp16_internal_t tmp;
334
+ __fp16 tmp;
330
335
  memcpy(&tmp, &h, sizeof(ggml_fp16_t));
331
336
  return (float)tmp;
332
337
  }
333
338
 
334
339
  static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
335
340
  ggml_fp16_t res;
336
- ggml_fp16_internal_t tmp = f;
341
+ __fp16 tmp = f;
337
342
  memcpy(&res, &tmp, sizeof(ggml_fp16_t));
338
343
  return res;
339
344
  }
@@ -357,8 +362,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
357
362
  #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
358
363
 
359
364
  static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
360
- register float f;
361
- register double d;
365
+ float f;
366
+ double d;
362
367
  __asm__(
363
368
  "mtfprd %0,%2\n"
364
369
  "xscvhpdp %0,%0\n"
@@ -370,8 +375,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
370
375
  }
371
376
 
372
377
  static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
373
- register double d;
374
- register ggml_fp16_t r;
378
+ double d;
379
+ ggml_fp16_t r;
375
380
  __asm__( /* xscvdphp can work on double or single precision */
376
381
  "xscvdphp %0,%2\n"
377
382
  "mffprd %1,%0\n" :
@@ -381,6 +386,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
381
386
  return r;
382
387
  }
383
388
 
389
+ #elif defined(__riscv) && defined(GGML_RV_ZFH)
390
+
391
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
392
+ float f;
393
+ __asm__(
394
+ "fmv.h.x %[f], %[h]\n\t"
395
+ "fcvt.s.h %[f], %[f]"
396
+ : [f] "=&f" (f)
397
+ : [h] "r" (h)
398
+ );
399
+ return f;
400
+ }
401
+
402
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
403
+ ggml_fp16_t res;
404
+ __asm__(
405
+ "fcvt.h.s %[f], %[f]\n\t"
406
+ "fmv.x.h %[h], %[f]"
407
+ : [h] "=&r" (res)
408
+ : [f] "f" (f)
409
+ );
410
+ return res;
411
+ }
412
+
413
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
414
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
415
+ #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
416
+ #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
417
+
384
418
  #else
385
419
 
386
420
  // FP16 <-> FP32
@@ -456,7 +490,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
456
490
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
457
491
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
458
492
 
459
- #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
493
+ #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
460
494
 
461
495
  // precomputed f32 table for f16 (256 KB)
462
496
  // defined in ggml.c, initialized in ggml_init()
@@ -1,6 +1,70 @@
1
1
  #ifndef GGML_METAL_IMPL
2
2
  #define GGML_METAL_IMPL
3
3
 
4
+ // kernel parameters for mat-vec threadgroups
5
+ //
6
+ // N_R0: number of src0 rows to process per simdgroup
7
+ // N_SG: number of simdgroups per threadgroup
8
+ //
9
+ // TODO: for optimal performance, make these a function of the device and work size
10
+
11
+ #define N_R0_Q4_0 4
12
+ #define N_SG_Q4_0 2
13
+
14
+ #define N_R0_Q4_1 4
15
+ #define N_SG_Q4_1 2
16
+
17
+ #define N_R0_Q5_0 4
18
+ #define N_SG_Q5_0 2
19
+
20
+ #define N_R0_Q5_1 4
21
+ #define N_SG_Q5_1 2
22
+
23
+ #define N_R0_Q8_0 4
24
+ #define N_SG_Q8_0 2
25
+
26
+ #define N_R0_Q2_K 4
27
+ #define N_SG_Q2_K 2
28
+
29
+ #define N_R0_Q3_K 2
30
+ #define N_SG_Q3_K 2
31
+
32
+ #define N_R0_Q4_K 4
33
+ #define N_SG_Q4_K 2
34
+
35
+ #define N_R0_Q5_K 2
36
+ #define N_SG_Q5_K 2
37
+
38
+ #define N_R0_Q6_K 1
39
+ #define N_SG_Q6_K 2
40
+
41
+ #define N_R0_IQ1_S 4
42
+ #define N_SG_IQ1_S 2
43
+
44
+ #define N_R0_IQ1_M 4
45
+ #define N_SG_IQ1_M 2
46
+
47
+ #define N_R0_IQ2_XXS 4
48
+ #define N_SG_IQ2_XXS 2
49
+
50
+ #define N_R0_IQ2_XS 4
51
+ #define N_SG_IQ2_XS 2
52
+
53
+ #define N_R0_IQ2_S 4
54
+ #define N_SG_IQ2_S 2
55
+
56
+ #define N_R0_IQ3_XXS 4
57
+ #define N_SG_IQ3_XXS 2
58
+
59
+ #define N_R0_IQ3_S 4
60
+ #define N_SG_IQ3_S 2
61
+
62
+ #define N_R0_IQ4_NL 2
63
+ #define N_SG_IQ4_NL 2
64
+
65
+ #define N_R0_IQ4_XS 2
66
+ #define N_SG_IQ4_XS 2
67
+
4
68
  // kernel argument structs
5
69
  //
6
70
  // - element counters (e.g. ne00) typically use int32_t to reduce register usage
@@ -155,9 +219,12 @@ typedef struct {
155
219
  int32_t ne11;
156
220
  int32_t ne_12_2; // assume K and V are same shape
157
221
  int32_t ne_12_3;
158
- uint64_t nb_12_1;
159
- uint64_t nb_12_2;
160
- uint64_t nb_12_3;
222
+ uint64_t nb11;
223
+ uint64_t nb12;
224
+ uint64_t nb13;
225
+ uint64_t nb21;
226
+ uint64_t nb22;
227
+ uint64_t nb23;
161
228
  uint64_t nb31;
162
229
  int32_t ne1;
163
230
  int32_t ne2;
@@ -25,124 +25,72 @@ endif ()
25
25
  if (GGML_OPENCL_EMBED_KERNELS)
26
26
  add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
27
27
 
28
- set(OPENCL_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
29
- set(OPENCL_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
30
- set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
28
+ set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
29
+ file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
31
30
 
32
- set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
33
- set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
34
- set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
35
- set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
36
- set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
37
- set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
38
-
39
- set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
40
- file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
41
-
42
- include_directories("${CMAKE_BINARY_DIR}/autogenerated")
43
-
44
- # Python must be accessible from command line
45
- add_custom_command(
46
- OUTPUT ${OPENCL_CL_SOURCE_EMBED}
47
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
48
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
49
- ${OPENCL_CL_SOURCE_EMBED}
50
- DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
51
- COMMENT "Generate ggml-opencl.cl.h"
52
- )
53
-
54
- add_custom_command(
55
- OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
56
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
57
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
58
- ${OPENCL_MM_CL_SOURCE_EMBED}
59
- DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
60
- COMMENT "Generate ggml-opencl_mm.cl.h"
61
- )
62
-
63
- add_custom_command(
64
- OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
65
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
66
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
67
- ${OPENCL_CVT_CL_SOURCE_EMBED}
68
- DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
69
- COMMENT "Generate ggml-opencl_cvt.cl.h"
70
- )
71
-
72
- add_custom_command(
73
- OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
74
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
75
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
76
- ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
77
- DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
78
- COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
79
- )
80
-
81
- add_custom_command(
82
- OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
83
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
84
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
85
- ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
86
- DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
87
- COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
88
- )
89
-
90
- add_custom_command(
91
- OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
92
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
93
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
94
- ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
95
- DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
96
- COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
97
- )
98
-
99
- add_custom_command(
100
- OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
101
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
102
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
103
- ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
104
- DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
105
- COMMENT "Generate ggml-opencl_transpose_16.cl.h"
106
- )
107
-
108
- add_custom_command(
109
- OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
110
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
111
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
112
- ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
113
- DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
114
- COMMENT "Generate ggml-opencl_transpose_32.cl.h"
115
- )
116
-
117
- add_custom_command(
118
- OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
119
- COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
120
- ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
121
- ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
122
- DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
123
- COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
124
- )
125
-
126
- target_sources(${TARGET_NAME} PRIVATE
127
- ${OPENCL_CL_SOURCE_EMBED}
128
- ${OPENCL_MM_CL_SOURCE_EMBED}
129
- ${OPENCL_CVT_CL_SOURCE_EMBED}
130
- ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
131
- ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
132
- ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
133
- ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
134
- ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
135
- ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
136
- else ()
137
- # copy ggml-opencl.cl to bin directory
138
- configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
139
- configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
140
- configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
141
-
142
- configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
143
- configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
144
- configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
145
- configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
146
- configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
147
- configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
31
+ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
148
32
  endif ()
33
+
34
+ function(ggml_opencl_add_kernel KNAME)
35
+ set(KERN_HDR ${CMAKE_CURRENT_BINARY_DIR}/autogenerated/${KNAME}.cl.h)
36
+ set(KERN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernels/${KNAME}.cl)
37
+
38
+ if (GGML_OPENCL_EMBED_KERNELS)
39
+ message(STATUS "opencl: embedding kernel ${KNAME}")
40
+
41
+ # Python must be accessible from the command line
42
+ add_custom_command(
43
+ OUTPUT ${KERN_HDR}
44
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} ${KERN_SRC} ${KERN_HDR}
45
+ DEPENDS ${KERN_SRC} ${EMBED_KERNEL_SCRIPT}
46
+ COMMENT "Generate ${KERN_HDR}"
47
+ )
48
+
49
+ target_sources(${TARGET_NAME} PRIVATE ${KERN_HDR})
50
+ else ()
51
+ message(STATUS "opencl: adding kernel ${KNAME}")
52
+ configure_file(${KERN_SRC} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${KNAME}.cl COPYONLY)
53
+ endif ()
54
+ endfunction()
55
+
56
+ set(GGML_OPENCL_KERNELS
57
+ add
58
+ clamp
59
+ cpy
60
+ cvt
61
+ diag_mask_inf
62
+ gelu
63
+ gemv_noshuffle_general
64
+ gemv_noshuffle
65
+ get_rows
66
+ im2col_f32
67
+ im2col_f16
68
+ mul_mat_Ab_Bi_8x4
69
+ mul_mv_f16_f16
70
+ mul_mv_f16_f32_1row
71
+ mul_mv_f16_f32_l4
72
+ mul_mv_f16_f32
73
+ mul_mv_f32_f32
74
+ mul_mv_q4_0_f32
75
+ mul_mv_q4_0_f32_v
76
+ mul_mv_q4_0_f32_8x_flat
77
+ mul_mv_q4_0_f32_1d_8x_flat
78
+ mul_mv_q4_0_f32_1d_16x_flat
79
+ mul_mv_q6_k
80
+ mul
81
+ norm
82
+ relu
83
+ rms_norm
84
+ rope
85
+ scale
86
+ silu
87
+ softmax_4_f32
88
+ softmax_4_f16
89
+ softmax_f32
90
+ softmax_f16
91
+ transpose
92
+ )
93
+
94
+ foreach (K ${GGML_OPENCL_KERNELS})
95
+ ggml_opencl_add_kernel(${K})
96
+ endforeach()