llama_cpp 0.15.1 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-cuda.cu

@@ -4,7 +4,6 @@
 
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
@@ -44,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
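Note: the new ggml_cuda_log above uses a measure-then-retry vsnprintf pattern: it formats into a fixed 128-byte stack buffer and heap-allocates an exact-size buffer only when the message is longer. A standalone sketch of the same pattern (illustrative only, not the vendored code):

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// Format a printf-style message, touching the heap only for long messages.
static std::string format_message(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    char buf[128];
    int len = vsnprintf(buf, sizeof(buf), fmt, args); // returns the full length, even if truncated
    va_end(args);
    if (len < 0) {
        return std::string(); // encoding error
    }
    if (len < (int) sizeof(buf)) {
        return std::string(buf, len); // fast path: fits on the stack
    }
    std::vector<char> big(len + 1); // +1 for the null terminator vsnprintf writes
    va_start(args, fmt); // a consumed va_list must be restarted before reuse
    vsnprintf(big.data(), big.size(), fmt, args);
    va_end(args);
    return std::string(big.data(), len);
}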
@@ -92,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -100,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -130,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -236,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -251,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -500,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1003,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-            size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -2205,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_RELU:
             ggml_cuda_op_relu(ctx, dst);
             break;
+        case GGML_UNARY_OP_SIGMOID:
+            ggml_cuda_op_sigmoid(ctx, dst);
+            break;
         case GGML_UNARY_OP_HARDSIGMOID:
             ggml_cuda_op_hardsigmoid(ctx, dst);
             break;
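The new dispatch above pairs with a ggml_cuda_op_sigmoid implementation in the ggml-cuda/ kernel sources, which this diff does not show. A plausible shape for the device kernel, following the pattern of the neighboring unary ops (a sketch under that assumption, not the vendored source):

// Elementwise sigmoid over a flat float array, one thread per element.
static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = 1.0f / (1.0f + expf(-x[i]));
}

static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int block_size = 256; // assumption: same block size as the other unary kernels
    const int num_blocks = (k + block_size - 1) / block_size;
    sigmoid_f32<<<num_blocks, block_size, 0, stream>>>(x, dst, k);
}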
@@ -2244,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2277,9 +2321,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE:
            ggml_cuda_op_rope(ctx, dst);
            break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2301,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2477,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2524,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2540,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
@@ -2559,7 +2600,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     }
 
     // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-    if (cuda_graph_update_required) {
+    if (use_cuda_graph && cuda_graph_update_required) {
         cuda_ctx->cuda_graph->number_consecutive_updates++;
     } else {
         cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2568,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
         cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-        fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+        GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
     }
 }
@@ -2606,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2625,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2692,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2714,12 +2755,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
@@ -2829,7 +2872,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2841,8 +2883,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+#else
+            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
     }
@@ -2940,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2990,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-            size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
data/vendor/tmp/llama.cpp/ggml-cuda.h

@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
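With this declaration, a host application can redirect all CUDA-backend log output. A minimal usage sketch, assuming the ggml_log_callback type from ggml.h (void (*)(enum ggml_log_level, const char * text, void * user_data)):

#include <cstdio>
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Route backend messages to a file instead of the default stderr.
static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
    fprintf((FILE *) user_data, "[cuda:%d] %s", (int) level, text); // messages carry their own newline
}

int main() {
    FILE * log_file = fopen("cuda.log", "w");
    ggml_backend_cuda_log_set_callback(my_log_cb, log_file);
    ggml_backend_t backend = ggml_backend_cuda_init(0); // device-discovery messages now go to cuda.log
    // ... build and run graphs on the backend ...
    ggml_backend_free(backend);
    fclose(log_file);
    return 0;
}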
data/vendor/tmp/llama.cpp/ggml-impl.h

@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
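The _WIN32 split exists because MSVC does not accept these explicit vector-type casts (it treats __m512bh and __m512i as interchangeable), while GCC and Clang require them. A hedged sketch of how such macros get used around the AVX512-BF16 dot-product intrinsic, assuming an AVX512-BF16 target (illustrative, not the vendored source):

#include <immintrin.h>
#include <stdint.h>

// Dot product of two bf16 vectors, 32 lanes per iteration.
// m512bh() reinterprets the raw integer load as a bf16 vector on GCC/Clang
// and is a no-op on MSVC.
static float dot_bf16(const uint16_t * x, const uint16_t * y, int n) {
    __m512 acc = _mm512_setzero_ps();
    for (int i = 0; i + 32 <= n; i += 32) {
        acc = _mm512_dpbf16_ps(acc,
                               m512bh(_mm512_loadu_si512(x + i)),
                               m512bh(_mm512_loadu_si512(y + i)));
    }
    return _mm512_reduce_add_ps(acc); // remaining n % 32 elements handled separately
}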
@@ -120,9 +132,16 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
 #endif
 
 // 16-bit float
@@ -436,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
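The two helpers above splat a scalar float across a LoongArch vector by reinterpreting its bits as int32_t through ft_union and reusing the integer splat intrinsics. The same punning step in scalar form, as an illustration:

#include <cstdint>
#include <cstdio>

// Union type punning mirrors the source; it is well-defined in C (ggml-impl.h
// is a C header) and supported as an extension by the major C++ compilers.
union ft_union_demo { int32_t i; float f; };

int main() {
    ft_union_demo u;
    u.f = 1.0f;
    std::printf("bit pattern of 1.0f: 0x%08x\n", u.i); // prints 0x3f800000
    return 0;
}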
data/vendor/tmp/llama.cpp/ggml-kompute.cpp

@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             case GGML_OP_SOFT_MAX:
                 {
                     float scale;
-                    memcpy(&scale, dst->op_params, sizeof(float));
+                    float max_bias;
 
-#pragma message("TODO: add ggml_vk_soft_max() F16 src1 and src2 support")
+                    memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
+                    memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                     GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-                    GGML_ASSERT(src2 == nullptr);
+
+#pragma message("TODO: add ALiBi support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                    GGML_ASSERT(max_bias == 0.0f);
 
                     ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                 } break;
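For context, the two memcpy reads above recover the { scale, max_bias } pair that ggml packs into the tensor's op_params blob when the soft_max op is built. A self-contained sketch of that pack/unpack symmetry (illustrative; names are not from the vendored source):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    int32_t op_params[16] = {0}; // ggml tensors carry a small fixed-size op_params blob
    const float scale = 0.125f, max_bias = 0.0f;

    // producer side: store { scale, max_bias } back to back
    const float packed[2] = { scale, max_bias };
    std::memcpy(op_params, packed, sizeof(packed));

    // consumer side: the same reads as in the kompute hunk above
    float s, mb;
    std::memcpy(&s, (const float *) op_params + 0, sizeof(float));
    std::memcpy(&mb, (const float *) op_params + 1, sizeof(float));
    assert(s == scale && mb == max_bias);
    return 0;
}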
@@ -1671,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_ROPE:
                 {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];