llama_cpp 0.15.1 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-cuda.cu:

@@ -4,7 +4,6 @@

 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
@@ -44,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
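The hunk above is the heart of the new logging API: every backend message now goes through ggml_cuda_log(), which formats into a 128-byte stack buffer, falls back to a heap buffer for longer messages (vsnprintf returns the length excluding the terminator, hence the len < 128 fast path and the len + 1 allocation), and forwards the result to whichever callback is installed. A minimal usage sketch from the embedding application's side (my_logger and its level tags are illustrative, not part of the API):

    #include <cstdio>
    #include "ggml.h"
    #include "ggml-cuda.h"

    // Forward backend messages to stderr with a level prefix.
    static void my_logger(enum ggml_log_level level, const char * msg, void * user_data) {
        (void) user_data;
        const char * tag = level == GGML_LOG_LEVEL_ERROR ? "ERR"
                         : level == GGML_LOG_LEVEL_WARN  ? "WRN" : "INF";
        fprintf(stderr, "[%s] %s", tag, msg); // the call sites all end their messages with '\n'
    }

    int main(void) {
        ggml_backend_cuda_log_set_callback(my_logger, /*user_data=*/NULL);
        // Device enumeration messages from ggml_cuda_init() now reach my_logger.
        ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_free(backend);
        return 0;
    }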
@@ -92,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -100,16 +139,16 @@
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -130,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -236,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -251,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -500,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1003,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -2205,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_RELU:
                     ggml_cuda_op_relu(ctx, dst);
                     break;
+                case GGML_UNARY_OP_SIGMOID:
+                    ggml_cuda_op_sigmoid(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_HARDSIGMOID:
                     ggml_cuda_op_hardsigmoid(ctx, dst);
                     break;
@@ -2244,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2277,9 +2321,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE:
             ggml_cuda_op_rope(ctx, dst);
             break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2301,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2477,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2524,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2540,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
@@ -2559,7 +2600,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
            cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2568,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2606,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2625,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2692,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2714,12 +2755,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
@@ -2829,7 +2872,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2841,8 +2883,16 @@
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+#else
+            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
     }
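For reference, the new GGML_OP_FLASH_ATTN_EXT rule condenses to the predicate below (a sketch; CC_VOLTA is the compute-capability constant from ggml-cuda/common.cuh, and flash_attn_ext_supported is an illustrative name, not part of the source):

    // Head sizes 64 and 128 are accepted everywhere; other head sizes need
    // tensor cores (Volta or newer) and are rejected on AMD via HIP.
    static bool flash_attn_ext_supported(int64_t head_size, int compute_capability) {
    #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        (void) compute_capability;
        return head_size == 64 || head_size == 128;
    #else
        return head_size == 64 || head_size == 128 || compute_capability >= CC_VOLTA;
    #endif
    }

This device dependence is also why ggml_backend_cuda_supports_op now pulls cuda_ctx out of the backend in the earlier hunk: the answer depends on which device the backend was created on.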
@@ -2940,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2990,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
data/vendor/tmp/llama.cpp/ggml-cuda.h:

@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
data/vendor/tmp/llama.cpp/ggml-impl.h:

@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
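The m512bh/m512i wrappers appear to paper over a compiler difference: GCC and Clang accept explicit casts between AVX-512 vector types such as __m512i and the bfloat16 vector type __m512bh, while MSVC's intrinsics headers do not support that cast syntax, so on _WIN32 the macros are identities. A hypothetical call site in the style of ggml's AVX512-BF16 dot-product code (x and y point at bf16 data):

    // Accumulate a bf16 dot product: reinterpret two raw 512-bit loads as
    // 32 bf16 lanes each and fused-multiply-add them into 16 f32 lanes.
    __m512 acc = _mm512_setzero_ps();
    acc = _mm512_dpbf16_ps(acc,
                           m512bh(_mm512_loadu_si512((const __m512i *) x)),
                           m512bh(_mm512_loadu_si512((const __m512i *) y)));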
@@ -120,9 +132,16 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
 #endif
 
 // 16-bit float
@@ -436,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
data/vendor/tmp/llama.cpp/ggml-kompute.cpp:

@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             case GGML_OP_SOFT_MAX:
                 {
                     float scale;
-                    memcpy(&scale, dst->op_params, sizeof(float));
+                    float max_bias;
 
-#pragma message("TODO: add ggml_vk_soft_max() F16 src1 and src2 support")
+                    memcpy(&scale,    (float *)dst->op_params + 0, sizeof(float));
+                    memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                     GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-                    GGML_ASSERT(src2 == nullptr);
+
+#pragma message("TODO: add ALiBi support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                    GGML_ASSERT(max_bias == 0.0f);
 
                     ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                 } break;
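The two memcpy calls assume soft_max packs its parameters as two consecutive floats in op_params, matching what ggml_soft_max_ext() stores on the graph-building side; a sketch of that layout:

    // op_params layout assumed above (two floats, in order):
    //   [0] scale    - multiplier applied to the logits before the softmax
    //   [1] max_bias - ALiBi maximum bias (0.0f when ALiBi is unused)
    const float scale    = ((const float *) dst->op_params)[0];
    const float max_bias = ((const float *) dst->op_params)[1];

The GGML_ASSERT(max_bias == 0.0f) then simply rejects graphs that actually use ALiBi until the Kompute kernels support it.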
@@ -1671,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             } break;
             case GGML_OP_ROPE:
                 {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];