@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
21
21
|
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
22
22
|
#define CUDA_R_16F HIPBLAS_R_16F
|
|
23
|
+
#define CUDA_R_16BF HIPBLAS_R_16B
|
|
23
24
|
#define CUDA_R_32F HIPBLAS_R_32F
|
|
24
25
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
|
|
25
26
|
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
|
|
@@ -70,6 +71,8 @@
|
|
|
70
71
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
|
71
72
|
#define cudaMalloc hipMalloc
|
|
72
73
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
|
74
|
+
#define cudaMallocManaged hipMallocManaged
|
|
75
|
+
#define cudaMemAdvise hipMemAdvise
|
|
73
76
|
#define cudaMemcpy hipMemcpy
|
|
74
77
|
#define cudaMemcpyAsync hipMemcpyAsync
|
|
75
78
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
|
@@ -151,6 +154,10 @@
|
|
|
151
154
|
#define CDNA
|
|
152
155
|
#endif
|
|
153
156
|
|
|
157
|
+
#if defined(__GFX12__)
|
|
158
|
+
#define RDNA4
|
|
159
|
+
#endif
|
|
160
|
+
|
|
154
161
|
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
|
155
162
|
defined(__gfx1150__) || defined(__gfx1151__)
|
|
156
163
|
#define RDNA3
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
|
16
16
|
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
|
17
17
|
#define CUDA_R_16F MUSA_R_16F
|
|
18
|
+
#define CUDA_R_16BF MUSA_R_16BF
|
|
18
19
|
#define CUDA_R_32F MUSA_R_32F
|
|
19
20
|
#define cublasComputeType_t cudaDataType_t
|
|
20
21
|
#define cublasCreate mublasCreate
|
|
@@ -148,8 +148,14 @@ struct ggml_map_custom2_op_params {
|
|
|
148
148
|
|
|
149
149
|
struct ggml_map_custom3_op_params {
|
|
150
150
|
ggml_custom3_op_t fun;
|
|
151
|
-
int
|
|
152
|
-
void
|
|
151
|
+
int n_tasks;
|
|
152
|
+
void * userdata;
|
|
153
|
+
};
|
|
154
|
+
|
|
155
|
+
struct ggml_custom_op_params {
|
|
156
|
+
ggml_custom_op_t fun;
|
|
157
|
+
int n_tasks;
|
|
158
|
+
void * userdata;
|
|
153
159
|
};
|
|
154
160
|
|
|
155
161
|
// bitset
|
|
@@ -311,29 +317,28 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
311
317
|
|
|
312
318
|
// FP16 to FP32 conversion
|
|
313
319
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
|
|
320
|
+
// 16-bit float
|
|
321
|
+
// on Arm, we use __fp16
|
|
322
|
+
// on x86, we use uint16_t
|
|
323
|
+
//
|
|
324
|
+
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
|
|
325
|
+
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
|
|
326
|
+
//
|
|
327
|
+
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
|
323
328
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
324
329
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
325
330
|
|
|
326
331
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
327
332
|
|
|
328
333
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
329
|
-
|
|
334
|
+
__fp16 tmp;
|
|
330
335
|
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
331
336
|
return (float)tmp;
|
|
332
337
|
}
|
|
333
338
|
|
|
334
339
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
335
340
|
ggml_fp16_t res;
|
|
336
|
-
|
|
341
|
+
__fp16 tmp = f;
|
|
337
342
|
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
338
343
|
return res;
|
|
339
344
|
}
|
|
@@ -357,8 +362,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
357
362
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
358
363
|
|
|
359
364
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
360
|
-
|
|
361
|
-
|
|
365
|
+
float f;
|
|
366
|
+
double d;
|
|
362
367
|
__asm__(
|
|
363
368
|
"mtfprd %0,%2\n"
|
|
364
369
|
"xscvhpdp %0,%0\n"
|
|
@@ -370,8 +375,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
370
375
|
}
|
|
371
376
|
|
|
372
377
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
373
|
-
|
|
374
|
-
|
|
378
|
+
double d;
|
|
379
|
+
ggml_fp16_t r;
|
|
375
380
|
__asm__( /* xscvdphp can work on double or single precision */
|
|
376
381
|
"xscvdphp %0,%2\n"
|
|
377
382
|
"mffprd %1,%0\n" :
|
|
@@ -381,6 +386,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
381
386
|
return r;
|
|
382
387
|
}
|
|
383
388
|
|
|
389
|
+
#elif defined(__riscv) && defined(GGML_RV_ZFH)
|
|
390
|
+
|
|
391
|
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
392
|
+
float f;
|
|
393
|
+
__asm__(
|
|
394
|
+
"fmv.h.x %[f], %[h]\n\t"
|
|
395
|
+
"fcvt.s.h %[f], %[f]"
|
|
396
|
+
: [f] "=&f" (f)
|
|
397
|
+
: [h] "r" (h)
|
|
398
|
+
);
|
|
399
|
+
return f;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
403
|
+
ggml_fp16_t res;
|
|
404
|
+
__asm__(
|
|
405
|
+
"fcvt.h.s %[f], %[f]\n\t"
|
|
406
|
+
"fmv.x.h %[h], %[f]"
|
|
407
|
+
: [h] "=&r" (res)
|
|
408
|
+
: [f] "f" (f)
|
|
409
|
+
);
|
|
410
|
+
return res;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
414
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
415
|
+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
416
|
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
417
|
+
|
|
384
418
|
#else
|
|
385
419
|
|
|
386
420
|
// FP16 <-> FP32
|
|
@@ -456,7 +490,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
456
490
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
457
491
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
458
492
|
|
|
459
|
-
#endif // defined(__ARM_NEON) && (!defined(
|
|
493
|
+
#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
|
460
494
|
|
|
461
495
|
// precomputed f32 table for f16 (256 KB)
|
|
462
496
|
// defined in ggml.c, initialized in ggml_init()
|
|
@@ -1,6 +1,70 @@
|
|
|
1
1
|
#ifndef GGML_METAL_IMPL
|
|
2
2
|
#define GGML_METAL_IMPL
|
|
3
3
|
|
|
4
|
+
// kernel parameters for mat-vec threadgroups
|
|
5
|
+
//
|
|
6
|
+
// N_R0: number of src0 rows to process per simdgroup
|
|
7
|
+
// N_SG: number of simdgroups per threadgroup
|
|
8
|
+
//
|
|
9
|
+
// TODO: for optimal performance, become function of the device and work size
|
|
10
|
+
|
|
11
|
+
#define N_R0_Q4_0 4
|
|
12
|
+
#define N_SG_Q4_0 2
|
|
13
|
+
|
|
14
|
+
#define N_R0_Q4_1 4
|
|
15
|
+
#define N_SG_Q4_1 2
|
|
16
|
+
|
|
17
|
+
#define N_R0_Q5_0 4
|
|
18
|
+
#define N_SG_Q5_0 2
|
|
19
|
+
|
|
20
|
+
#define N_R0_Q5_1 4
|
|
21
|
+
#define N_SG_Q5_1 2
|
|
22
|
+
|
|
23
|
+
#define N_R0_Q8_0 4
|
|
24
|
+
#define N_SG_Q8_0 2
|
|
25
|
+
|
|
26
|
+
#define N_R0_Q2_K 4
|
|
27
|
+
#define N_SG_Q2_K 2
|
|
28
|
+
|
|
29
|
+
#define N_R0_Q3_K 2
|
|
30
|
+
#define N_SG_Q3_K 2
|
|
31
|
+
|
|
32
|
+
#define N_R0_Q4_K 4
|
|
33
|
+
#define N_SG_Q4_K 2
|
|
34
|
+
|
|
35
|
+
#define N_R0_Q5_K 2
|
|
36
|
+
#define N_SG_Q5_K 2
|
|
37
|
+
|
|
38
|
+
#define N_R0_Q6_K 1
|
|
39
|
+
#define N_SG_Q6_K 2
|
|
40
|
+
|
|
41
|
+
#define N_R0_IQ1_S 4
|
|
42
|
+
#define N_SG_IQ1_S 2
|
|
43
|
+
|
|
44
|
+
#define N_R0_IQ1_M 4
|
|
45
|
+
#define N_SG_IQ1_M 2
|
|
46
|
+
|
|
47
|
+
#define N_R0_IQ2_XXS 4
|
|
48
|
+
#define N_SG_IQ2_XXS 2
|
|
49
|
+
|
|
50
|
+
#define N_R0_IQ2_XS 4
|
|
51
|
+
#define N_SG_IQ2_XS 2
|
|
52
|
+
|
|
53
|
+
#define N_R0_IQ2_S 4
|
|
54
|
+
#define N_SG_IQ2_S 2
|
|
55
|
+
|
|
56
|
+
#define N_R0_IQ3_XXS 4
|
|
57
|
+
#define N_SG_IQ3_XXS 2
|
|
58
|
+
|
|
59
|
+
#define N_R0_IQ3_S 4
|
|
60
|
+
#define N_SG_IQ3_S 2
|
|
61
|
+
|
|
62
|
+
#define N_R0_IQ4_NL 2
|
|
63
|
+
#define N_SG_IQ4_NL 2
|
|
64
|
+
|
|
65
|
+
#define N_R0_IQ4_XS 2
|
|
66
|
+
#define N_SG_IQ4_XS 2
|
|
67
|
+
|
|
4
68
|
// kernel argument structs
|
|
5
69
|
//
|
|
6
70
|
// - element counters (e.g. ne00) typically use int32_t to reduce register usage
|
|
@@ -155,9 +219,12 @@ typedef struct {
|
|
|
155
219
|
int32_t ne11;
|
|
156
220
|
int32_t ne_12_2; // assume K and V are same shape
|
|
157
221
|
int32_t ne_12_3;
|
|
158
|
-
uint64_t
|
|
159
|
-
uint64_t
|
|
160
|
-
uint64_t
|
|
222
|
+
uint64_t nb11;
|
|
223
|
+
uint64_t nb12;
|
|
224
|
+
uint64_t nb13;
|
|
225
|
+
uint64_t nb21;
|
|
226
|
+
uint64_t nb22;
|
|
227
|
+
uint64_t nb23;
|
|
161
228
|
uint64_t nb31;
|
|
162
229
|
int32_t ne1;
|
|
163
230
|
int32_t ne2;
|
|
@@ -25,124 +25,72 @@ endif ()
|
|
|
25
25
|
if (GGML_OPENCL_EMBED_KERNELS)
|
|
26
26
|
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
|
|
27
27
|
|
|
28
|
-
set(
|
|
29
|
-
|
|
30
|
-
set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
|
|
28
|
+
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
|
|
29
|
+
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
|
31
30
|
|
|
32
|
-
|
|
33
|
-
set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
|
|
34
|
-
set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
|
|
35
|
-
set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
|
|
36
|
-
set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
|
|
37
|
-
set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
|
|
38
|
-
|
|
39
|
-
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
|
|
40
|
-
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
|
|
41
|
-
|
|
42
|
-
include_directories("${CMAKE_BINARY_DIR}/autogenerated")
|
|
43
|
-
|
|
44
|
-
# Python must be accessible from command line
|
|
45
|
-
add_custom_command(
|
|
46
|
-
OUTPUT ${OPENCL_CL_SOURCE_EMBED}
|
|
47
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
48
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
|
|
49
|
-
${OPENCL_CL_SOURCE_EMBED}
|
|
50
|
-
DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
|
|
51
|
-
COMMENT "Generate ggml-opencl.cl.h"
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
add_custom_command(
|
|
55
|
-
OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
|
|
56
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
57
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
|
|
58
|
-
${OPENCL_MM_CL_SOURCE_EMBED}
|
|
59
|
-
DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
|
|
60
|
-
COMMENT "Generate ggml-opencl_mm.cl.h"
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
add_custom_command(
|
|
64
|
-
OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
|
|
65
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
66
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
|
|
67
|
-
${OPENCL_CVT_CL_SOURCE_EMBED}
|
|
68
|
-
DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
|
|
69
|
-
COMMENT "Generate ggml-opencl_cvt.cl.h"
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
add_custom_command(
|
|
73
|
-
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
|
74
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
75
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
|
|
76
|
-
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
|
77
|
-
DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
|
|
78
|
-
COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
add_custom_command(
|
|
82
|
-
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
|
83
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
84
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
|
|
85
|
-
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
|
86
|
-
DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
|
|
87
|
-
COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
add_custom_command(
|
|
91
|
-
OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
|
92
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
93
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
|
|
94
|
-
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
|
95
|
-
DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
|
|
96
|
-
COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
add_custom_command(
|
|
100
|
-
OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
|
101
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
102
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
|
|
103
|
-
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
|
104
|
-
DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
|
|
105
|
-
COMMENT "Generate ggml-opencl_transpose_16.cl.h"
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
add_custom_command(
|
|
109
|
-
OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
|
110
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
111
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
|
|
112
|
-
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
|
113
|
-
DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
|
|
114
|
-
COMMENT "Generate ggml-opencl_transpose_32.cl.h"
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
add_custom_command(
|
|
118
|
-
OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
|
|
119
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
120
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
|
|
121
|
-
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
|
|
122
|
-
DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
|
|
123
|
-
COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
target_sources(${TARGET_NAME} PRIVATE
|
|
127
|
-
${OPENCL_CL_SOURCE_EMBED}
|
|
128
|
-
${OPENCL_MM_CL_SOURCE_EMBED}
|
|
129
|
-
${OPENCL_CVT_CL_SOURCE_EMBED}
|
|
130
|
-
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
|
131
|
-
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
|
132
|
-
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
|
133
|
-
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
|
134
|
-
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
|
135
|
-
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
|
|
136
|
-
else ()
|
|
137
|
-
# copy ggml-opencl.cl to bin directory
|
|
138
|
-
configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
|
|
139
|
-
configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
|
|
140
|
-
configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
|
|
141
|
-
|
|
142
|
-
configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
|
|
143
|
-
configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
|
|
144
|
-
configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
|
|
145
|
-
configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
|
|
146
|
-
configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
|
|
147
|
-
configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
|
|
31
|
+
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
|
148
32
|
endif ()
|
|
33
|
+
|
|
34
|
+
function(ggml_opencl_add_kernel KNAME)
|
|
35
|
+
set(KERN_HDR ${CMAKE_CURRENT_BINARY_DIR}/autogenerated/${KNAME}.cl.h)
|
|
36
|
+
set(KERN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernels/${KNAME}.cl)
|
|
37
|
+
|
|
38
|
+
if (GGML_OPENCL_EMBED_KERNELS)
|
|
39
|
+
message(STATUS "opencl: embedding kernel ${KNAME}")
|
|
40
|
+
|
|
41
|
+
# Python must be accessible from command line
|
|
42
|
+
add_custom_command(
|
|
43
|
+
OUTPUT ${KERN_HDR}
|
|
44
|
+
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} ${KERN_SRC} ${KERN_HDR}
|
|
45
|
+
DEPENDS ${KERN_SRC} ${EMBED_KERNEL_SCRIPT}
|
|
46
|
+
COMMENT "Generate ${KERN_HDR}"
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
target_sources(${TARGET_NAME} PRIVATE ${KERN_HDR})
|
|
50
|
+
else ()
|
|
51
|
+
message(STATUS "opencl: adding kernel ${KNAME}")
|
|
52
|
+
configure_file(${KERN_SRC} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${KNAME}.cl COPYONLY)
|
|
53
|
+
endif ()
|
|
54
|
+
endfunction()
|
|
55
|
+
|
|
56
|
+
set(GGML_OPENCL_KERNELS
|
|
57
|
+
add
|
|
58
|
+
clamp
|
|
59
|
+
cpy
|
|
60
|
+
cvt
|
|
61
|
+
diag_mask_inf
|
|
62
|
+
gelu
|
|
63
|
+
gemv_noshuffle_general
|
|
64
|
+
gemv_noshuffle
|
|
65
|
+
get_rows
|
|
66
|
+
im2col_f32
|
|
67
|
+
im2col_f16
|
|
68
|
+
mul_mat_Ab_Bi_8x4
|
|
69
|
+
mul_mv_f16_f16
|
|
70
|
+
mul_mv_f16_f32_1row
|
|
71
|
+
mul_mv_f16_f32_l4
|
|
72
|
+
mul_mv_f16_f32
|
|
73
|
+
mul_mv_f32_f32
|
|
74
|
+
mul_mv_q4_0_f32
|
|
75
|
+
mul_mv_q4_0_f32_v
|
|
76
|
+
mul_mv_q4_0_f32_8x_flat
|
|
77
|
+
mul_mv_q4_0_f32_1d_8x_flat
|
|
78
|
+
mul_mv_q4_0_f32_1d_16x_flat
|
|
79
|
+
mul_mv_q6_k
|
|
80
|
+
mul
|
|
81
|
+
norm
|
|
82
|
+
relu
|
|
83
|
+
rms_norm
|
|
84
|
+
rope
|
|
85
|
+
scale
|
|
86
|
+
silu
|
|
87
|
+
softmax_4_f32
|
|
88
|
+
softmax_4_f16
|
|
89
|
+
softmax_f32
|
|
90
|
+
softmax_f16
|
|
91
|
+
transpose
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
foreach (K ${GGML_OPENCL_KERNELS})
|
|
95
|
+
ggml_opencl_add_kernel(${K})
|
|
96
|
+
endforeach()
|