@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -0,0 +1,17 @@
1
+ // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2
+ // SPDX-License-Identifier: MIT
3
+ //
4
+
5
+ #pragma once
6
+
7
+ #include "ggml-alloc.h"
8
+
9
+ #ifdef __cplusplus
10
+ extern "C" {
11
+ #endif
12
+
13
+ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void);
14
+
15
+ #ifdef __cplusplus
16
+ }
17
+ #endif
@@ -7,7 +7,7 @@ if (CUDAToolkit_FOUND)
7
7
 
8
8
  if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
9
9
  # native == GPUs available at build time
10
- # 52 == Maxwell, lowest CUDA 12 standard
10
+ # 50 == Maxwell, lowest CUDA 12 standard
11
11
  # 60 == P100, FP16 CUDA intrinsics
12
12
  # 61 == Pascal, __dp4a instruction (per-byte integer dot product)
13
13
  # 70 == V100, FP16 tensor cores
@@ -17,7 +17,7 @@ if (CUDAToolkit_FOUND)
17
17
  elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
18
18
  set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
19
19
  else()
20
- set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75;80")
20
+ set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80")
21
21
  endif()
22
22
  endif()
23
23
  message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -69,6 +69,10 @@ if (CUDAToolkit_FOUND)
69
69
  add_compile_definitions(GGML_CUDA_NO_VMM)
70
70
  endif()
71
71
 
72
+ if (NOT GGML_CUDA_FA)
73
+ add_compile_definitions(GGML_CUDA_NO_FA)
74
+ endif()
75
+
72
76
  if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
73
77
  add_compile_definitions(GGML_CUDA_F16)
74
78
  endif()
@@ -98,6 +102,15 @@ if (CUDAToolkit_FOUND)
98
102
 
99
103
  set(CUDA_FLAGS -use_fast_math)
100
104
 
105
+ if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
106
+ # Options are:
107
+ # - none (not recommended)
108
+ # - speed (nvcc's default)
109
+ # - balance
110
+ # - size
111
+ list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
112
+ endif()
113
+
101
114
  if (GGML_FATAL_WARNINGS)
102
115
  list(APPEND CUDA_FLAGS -Werror all-warnings)
103
116
  endif()
@@ -39,6 +39,12 @@ endif()
39
39
  find_package(hip REQUIRED)
40
40
  find_package(hipblas REQUIRED)
41
41
  find_package(rocblas REQUIRED)
42
+ if (GGML_HIP_ROCWMMA_FATTN)
43
+ CHECK_INCLUDE_FILE_CXX("rocwmma/rocwmma.hpp" FOUND_ROCWMMA)
44
+ if (NOT ${FOUND_ROCWMMA})
45
+ message(FATAL_ERROR "rocwmma has not been found")
46
+ endif()
47
+ endif()
42
48
 
43
49
  if (${hip_VERSION} VERSION_LESS 5.5)
44
50
  message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
@@ -107,6 +113,14 @@ if (GGML_HIP_NO_VMM)
107
113
  add_compile_definitions(GGML_HIP_NO_VMM)
108
114
  endif()
109
115
 
116
+ if (GGML_HIP_ROCWMMA_FATTN)
117
+ add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
118
+ endif()
119
+
120
+ if (NOT GGML_CUDA_FA)
121
+ add_compile_definitions(GGML_CUDA_NO_FA)
122
+ endif()
123
+
110
124
  if (CXX_IS_HIPCC)
111
125
  set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
112
126
  target_link_libraries(ggml-hip PRIVATE hip::device)
@@ -16,7 +16,7 @@
16
16
  #include <arm_sve.h>
17
17
  #endif // __ARM_FEATURE_SVE
18
18
 
19
- #if defined(__ARM_NEON) && !defined(__CUDACC__)
19
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
20
20
  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
21
21
  //
22
22
  // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
@@ -27,12 +27,12 @@ configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
27
27
  configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
28
28
  configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
29
29
 
30
+ set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
30
31
  if (GGML_METAL_EMBED_LIBRARY)
31
32
  enable_language(ASM)
32
33
 
33
34
  add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
34
35
 
35
- set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
36
36
  set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
37
37
  set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
38
38
 
@@ -88,12 +88,11 @@ else()
88
88
 
89
89
  add_custom_command(
90
90
  OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
91
- COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
92
- COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
93
- COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
91
+ COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
92
+ xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
94
93
  COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
95
94
  COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
96
- DEPENDS ggml-metal.metal ggml-common.h
95
+ DEPENDS ggml-metal.metal ${METALLIB_COMMON}
97
96
  COMMENT "Compiling Metal kernels"
98
97
  )
99
98
 
@@ -285,4 +285,239 @@ typedef struct {
285
285
  float eps;
286
286
  } ggml_metal_kargs_rms_norm;
287
287
 
288
+ typedef struct {
289
+ int64_t ne00;
290
+ int64_t ne01;
291
+ int64_t ne02;
292
+ uint64_t nb00;
293
+ uint64_t nb01;
294
+ uint64_t nb02;
295
+ int32_t n_groups;
296
+ float eps;
297
+ } ggml_metal_kargs_group_norm;
298
+
299
+ typedef struct {
300
+ int32_t IC;
301
+ int32_t IL;
302
+ int32_t K;
303
+ int32_t s0;
304
+ uint64_t nb0;
305
+ uint64_t nb1;
306
+ } ggml_metal_kargs_conv_transpose_1d;
307
+
308
+ typedef struct {
309
+ uint64_t ofs0;
310
+ uint64_t ofs1;
311
+ int32_t IW;
312
+ int32_t IH;
313
+ int32_t CHW;
314
+ int32_t s0;
315
+ int32_t s1;
316
+ int32_t p0;
317
+ int32_t p1;
318
+ int32_t d0;
319
+ int32_t d1;
320
+ int32_t N;
321
+ int32_t KH;
322
+ int32_t KW;
323
+ int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources
324
+ } ggml_metal_kargs_im2col;
325
+
326
+ typedef struct {
327
+ int64_t ne00;
328
+ int64_t ne01;
329
+ int64_t ne02;
330
+ int64_t ne03;
331
+ uint64_t nb00;
332
+ uint64_t nb01;
333
+ uint64_t nb02;
334
+ uint64_t nb03;
335
+ int64_t ne10;
336
+ int64_t ne11;
337
+ int64_t ne12;
338
+ int64_t ne13;
339
+ uint64_t nb10;
340
+ uint64_t nb11;
341
+ uint64_t nb12;
342
+ uint64_t nb13;
343
+ int64_t ne0;
344
+ int64_t ne1;
345
+ int64_t ne2;
346
+ int64_t ne3;
347
+ uint64_t nb0;
348
+ uint64_t nb1;
349
+ uint64_t nb2;
350
+ uint64_t nb3;
351
+ } ggml_metal_kargs_sum_rows;
352
+
353
+ typedef struct {
354
+ int64_t ne00;
355
+ int64_t ne01;
356
+ int64_t ne02;
357
+ float scale;
358
+ float max_bias;
359
+ float m0;
360
+ float m1;
361
+ uint32_t n_head_log2;
362
+ } ggml_metal_kargs_soft_max;
363
+
364
+ typedef struct {
365
+ int64_t ne00;
366
+ int64_t ne01;
367
+ int n_past;
368
+ } ggml_metal_kargs_diag_mask_inf;
369
+
370
+ typedef struct {
371
+ int64_t ne00;
372
+ int64_t ne01;
373
+ int64_t ne02;
374
+ uint64_t nb00;
375
+ uint64_t nb01;
376
+ uint64_t nb02;
377
+ int64_t ne10;
378
+ int64_t ne11;
379
+ uint64_t nb10;
380
+ uint64_t nb11;
381
+ int64_t ne0;
382
+ int64_t ne1;
383
+ int64_t ne2;
384
+ uint64_t nb0;
385
+ uint64_t nb1;
386
+ uint64_t nb2;
387
+ } ggml_metal_kargs_ssm_conv;
388
+
389
+ typedef struct {
390
+ int64_t d_state;
391
+ int64_t d_inner;
392
+ int64_t n_seq_tokens;
393
+ int64_t n_seqs;
394
+ uint64_t nb00;
395
+ uint64_t nb01;
396
+ uint64_t nb02;
397
+ uint64_t nb10;
398
+ uint64_t nb11;
399
+ uint64_t nb12;
400
+ uint64_t nb13;
401
+ uint64_t nb20;
402
+ uint64_t nb21;
403
+ uint64_t nb22;
404
+ uint64_t nb30;
405
+ uint64_t nb31;
406
+ uint64_t nb40;
407
+ uint64_t nb41;
408
+ uint64_t nb42;
409
+ uint64_t nb50;
410
+ uint64_t nb51;
411
+ uint64_t nb52;
412
+ } ggml_metal_kargs_ssm_scan;
413
+
414
+ typedef struct {
415
+ int64_t ne00;
416
+ uint64_t nb01;
417
+ uint64_t nb02;
418
+ int64_t ne10;
419
+ uint64_t nb10;
420
+ uint64_t nb11;
421
+ uint64_t nb1;
422
+ uint64_t nb2;
423
+ } ggml_metal_kargs_get_rows;
424
+
425
+ typedef struct {
426
+ int64_t ne00;
427
+ int64_t ne01;
428
+ int64_t ne02;
429
+ int64_t ne03;
430
+ uint64_t nb00;
431
+ uint64_t nb01;
432
+ uint64_t nb02;
433
+ uint64_t nb03;
434
+ int64_t ne0;
435
+ int64_t ne1;
436
+ int64_t ne2;
437
+ int64_t ne3;
438
+ uint64_t nb0;
439
+ uint64_t nb1;
440
+ uint64_t nb2;
441
+ uint64_t nb3;
442
+ float sf0;
443
+ float sf1;
444
+ float sf2;
445
+ float sf3;
446
+ } ggml_metal_kargs_upscale;
447
+
448
+ typedef struct {
449
+ int64_t ne00;
450
+ int64_t ne01;
451
+ int64_t ne02;
452
+ int64_t ne03;
453
+ uint64_t nb00;
454
+ uint64_t nb01;
455
+ uint64_t nb02;
456
+ uint64_t nb03;
457
+ int64_t ne0;
458
+ int64_t ne1;
459
+ int64_t ne2;
460
+ int64_t ne3;
461
+ uint64_t nb0;
462
+ uint64_t nb1;
463
+ uint64_t nb2;
464
+ uint64_t nb3;
465
+ } ggml_metal_kargs_pad;
466
+
467
+ typedef struct {
468
+ int64_t ne00;
469
+ int64_t ne01;
470
+ int64_t ne02;
471
+ int64_t ne03;
472
+ uint64_t nb00;
473
+ uint64_t nb01;
474
+ uint64_t nb02;
475
+ uint64_t nb03;
476
+ int64_t ne0;
477
+ int64_t ne1;
478
+ int64_t ne2;
479
+ int64_t ne3;
480
+ uint64_t nb0;
481
+ uint64_t nb1;
482
+ uint64_t nb2;
483
+ uint64_t nb3;
484
+ int32_t p0;
485
+ int32_t p1;
486
+ } ggml_metal_kargs_pad_reflect_1d;
487
+
488
+ typedef struct {
489
+ uint64_t nb1;
490
+ int dim;
491
+ int max_period;
492
+ } ggml_metal_kargs_timestep_embedding;
493
+
494
+ typedef struct {
495
+ float slope;
496
+ } ggml_metal_kargs_leaky_relu;
497
+
498
+ typedef struct {
499
+ int64_t ncols;
500
+ int64_t ncols_pad;
501
+ } ggml_metal_kargs_argsort;
502
+
503
+ typedef struct {
504
+ int64_t ne0;
505
+ float start;
506
+ float step;
507
+ } ggml_metal_kargs_arange;
508
+
509
+ typedef struct {
510
+ int32_t k0;
511
+ int32_t k1;
512
+ int32_t s0;
513
+ int32_t s1;
514
+ int32_t p0;
515
+ int32_t p1;
516
+ int64_t IH;
517
+ int64_t IW;
518
+ int64_t OH;
519
+ int64_t OW;
520
+ int64_t parallel_elements;
521
+ } ggml_metal_kargs_pool_2d;
522
+
288
523
  #endif // GGML_METAL_IMPL
@@ -21,7 +21,7 @@ if (MUSAToolkit_FOUND)
21
21
  message(STATUS "MUSA Toolkit found")
22
22
 
23
23
  if (NOT DEFINED MUSA_ARCHITECTURES)
24
- set(MUSA_ARCHITECTURES "21;22")
24
+ set(MUSA_ARCHITECTURES "21;22;31")
25
25
  endif()
26
26
  message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
27
27
 
@@ -49,7 +49,7 @@ if (MUSAToolkit_FOUND)
49
49
 
50
50
  set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
51
51
  foreach(SOURCE ${GGML_SOURCES_MUSA})
52
- set(COMPILE_FLAGS "-x musa -mtgpu")
52
+ set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
53
53
  foreach(ARCH ${MUSA_ARCHITECTURES})
54
54
  set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
55
55
  endforeach()
@@ -83,6 +83,10 @@ if (MUSAToolkit_FOUND)
83
83
  add_compile_definitions(GGML_CUDA_NO_VMM)
84
84
  endif()
85
85
 
86
+ if (NOT GGML_CUDA_FA)
87
+ add_compile_definitions(GGML_CUDA_NO_FA)
88
+ endif()
89
+
86
90
  if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
87
91
  add_compile_definitions(GGML_CUDA_F16)
88
92
  endif()
@@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
15
15
  endif ()
16
16
 
17
17
  add_compile_definitions(GGML_OPENCL_SOA_Q)
18
+ add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})
18
19
 
19
20
  if (GGML_OPENCL_USE_ADRENO_KERNELS)
20
21
  message(STATUS "OpenCL will use matmul kernels optimized for Adreno")