llama_cpp 0.15.2 → 0.15.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml-cuda.cu:

@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
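This first hunk replaces the backend's hard-coded `fprintf(stderr, ...)` calls with a user-replaceable log callback, exposed as `ggml_backend_cuda_log_set_callback`. A minimal usage sketch (the sink function below is illustrative, not part of the gem):

```cpp
#include <cstdio>
#include "ggml-cuda.h"

// Illustrative sink: forward CUDA backend messages to the FILE * in user_data.
static void my_cuda_log_sink(enum ggml_log_level level, const char * msg, void * user_data) {
    fprintf((FILE *) user_data, "[ggml-cuda %d] %s", (int) level, msg); // msg carries its own '\n'
}

int main() {
    ggml_backend_cuda_log_set_callback(my_cuda_log_sink, stderr);
    ggml_backend_t backend = ggml_backend_cuda_init(0); // device-enumeration lines now hit the sink
    ggml_backend_free(backend);
    return 0;
}
```

Note that `ggml_cuda_log` formats into a 128-byte stack buffer first and only falls back to a heap-allocated `std::vector<char>` when `vsnprintf` reports a longer message, so short log lines cost no allocation.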
@@ -79,6 +119,20 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+    ggml_cuda_set_device(device);
+#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
+    auto res = hipMallocManaged(ptr, size);
+    if (res == hipSuccess) {
+        // if error we "need" to know why...
+        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+    }
+    return res;
+#else
+    return cudaMalloc(ptr, size);
+#endif
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
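`ggml_cuda_device_malloc` is a file-local (static) helper, so only call sites inside ggml-cuda.cu go through it; the point is that a HIP build with `GGML_HIP_UMA` transparently swaps `cudaMalloc` for `hipMallocManaged` plus a coarse-grain `hipMemAdvise`. A hedged sketch of the call-site pattern it enables (size and device index are illustrative, and this assumes it runs inside ggml-cuda.cu where the helper is visible):

```cpp
// Sketch: allocate through the wrapper, release as usual. On the HIP/UMA path
// the pointer refers to managed memory, but it is still released with cudaFree,
// which ggml's HIP build maps to hipFree.
void * dev_ptr = nullptr;
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, 1024 * 1024, /*device=*/0);
if (err != cudaSuccess) {
    cudaGetLastError(); // clear the latched error before reporting or falling back
} else {
    CUDA_CHECK(cudaFree(dev_ptr));
}
```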
@@ -91,7 +145,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -99,16 +153,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -129,7 +183,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -231,12 +285,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
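The legacy pool over-allocates each miss by 5% and rounds up to a 256-byte boundary, so slightly larger follow-up requests can reuse the same buffer. The sizing arithmetic from the hunk, as a worked example:

```cpp
size_t size = 1000000;                                    // requested bytes
size_t look_ahead_size = (size_t) (1.05 * size);          // 1050000
look_ahead_size = 256 * ((look_ahead_size + 255) / 256);  // 1050112, a multiple of 256
```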
@@ -250,7 +304,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -497,9 +551,11 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
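The added `cudaGetLastError()` matters beyond logging: the CUDA runtime latches the most recent error in thread-local state, and reading it resets the latch, so a failed allocation does not surface again from a later, unrelated `CUDA_CHECK`. The pattern in isolation (a sketch; the oversized request is just a way to force failure):

```cpp
void * p = nullptr;
cudaError_t err = cudaMalloc(&p, (size_t) 1 << 48); // deliberately impossible request
if (err != cudaSuccess) {
    cudaGetLastError(); // consume the latched cudaErrorMemoryAllocation
    // report and fall back; subsequent CUDA calls start from a clean error state
}
```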
@@ -756,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
         char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
@@ -1002,8 +1058,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1814,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         }
     }
 #else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
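`ggml_is_contiguous_2` replaces an inline stride comparison. Per the surviving comment, the requirement is that src0 and src1 have no padding between the matrices stacked along dims 2 and 3, which is what the constant strides passed to `cublasGemmStridedBatchedEx` assume. A simplified sketch of the property being checked (not the library's exact implementation, which also accounts for quantized block layouts):

```cpp
// Sketch: dim 3 starts exactly where dim 2's ne[2] slices end, i.e. the
// batch of matrices is densely packed.
static bool contiguous_across_dims_2_3(const struct ggml_tensor * t) {
    return t->nb[3] == t->nb[2] * t->ne[2];
}
```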
@@ -2246,7 +2302,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2356,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2468,15 +2524,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 
     bool use_cuda_graph = true;
     bool cuda_graph_update_required = false;
-    // pointer to CUDA cpy kernel, which is required to identify
+    // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
-    void * ggml_cuda_cpy_fn_ptr = nullptr;
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2579,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2539,16 +2595,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
             if (node->op == GGML_OP_CPY) {
                 // store the copy op parameter which changes with each token.
                 cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
-                if (ggml_cuda_cpy_fn_ptr == nullptr) {
-                    // store a pointer to the copy op CUDA kernel to identify it later
-                    ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                // store a pointer to each copy op CUDA kernel to identify it later
+                void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
                 }
             }
 
@@ -2567,7 +2624,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2662,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2624,7 +2681,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2678,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
             int k = 0;
             for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
                     char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
                     cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
                     CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
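Because a graph can now contain several distinct copy kernels, the old equality test against a single `ggml_cuda_cpy_fn_ptr` becomes a membership test over the collected pointers. The vector holds at most one entry per distinct copy-kernel type, so a linear scan is cheap. The test extracted as a sketch:

```cpp
#include <algorithm>
#include <vector>

// Sketch of the membership test used when patching kernel parameters.
static bool is_cpy_kernel(const std::vector<void *> & fns, void * fn) {
    return std::count(fns.begin(), fns.end(), fn) > 0;
}
```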
@@ -2691,7 +2748,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2829,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2948,13 +3007,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2998,8 +3057,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();
 
-        fprintf(stderr, "%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
data/vendor/tmp/llama.cpp/ggml-cuda.h:

@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
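The public declaration lands in ggml-cuda.h inside the extern "C" block. For reference, `ggml_log_callback` is the type ggml.h already defines for its other log hooks:

```cpp
// From ggml.h (not changed by this diff):
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
```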
data/vendor/tmp/llama.cpp/ggml-impl.h:

@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
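MSVC does not accept the C-style casts to `__m512bh`/`__m512i` that GCC and Clang allow here, so on `_WIN32` the macros pass the value through unchanged. A usage sketch with the AVX512-BF16 dot-product intrinsic (illustrative operands; this mirrors how ggml's bf16 vec-dot code applies the macro):

```cpp
#include <immintrin.h>

// Sketch: m512bh() adapts a raw 512-bit load to the __m512bh operand type
// expected by _mm512_dpbf16_ps (BF16 pairwise dot-product accumulate).
static __m512 bf16_dot_step(__m512 acc, const void * x, const void * y) {
    return _mm512_dpbf16_ps(acc, m512bh(_mm512_loadu_si512(x)),
                                 m512bh(_mm512_loadu_si512(y)));
}
```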
@@ -132,6 +144,10 @@ extern "C" {
 #endif
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#endif
+
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -443,6 +459,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
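LSX/LASX only provide replicate-from-integer-register instructions (`__lsx_vreplgr2vr_w`, `__lasx_xvreplgr2vr_w`), so the new helpers splat a float by routing its bit pattern through the `ft_union`. The same reinterpretation in portable scalar form (a sketch):

```cpp
#include <cstdint>
#include <cstring>

// Sketch: broadcast one float into four lanes by moving its raw bits through
// an int32_t, mirroring what __lsx_vreplfr2vr_s does with vector registers.
static void splat_f32x4(float val, float out[4]) {
    int32_t bits;
    std::memcpy(&bits, &val, sizeof bits); // the ft_union reinterpretation, portably
    for (int i = 0; i < 4; i++) {
        std::memcpy(&out[i], &bits, sizeof bits);
    }
}
```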
data/vendor/tmp/llama.cpp/ggml-kompute.cpp:

@@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
         {
             GGML_ASSERT(ne00 == ne10);
 
-            // TODO: assert that dim2 and dim3 are contiguous
             GGML_ASSERT(ne12 % ne02 == 0);
             GGML_ASSERT(ne13 % ne03 == 0);
 
@@ -1677,6 +1676,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
             } break;
         case GGML_OP_ROPE:
             {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                 GGML_ASSERT(ne10 == ne02);
                 GGML_ASSERT(src0t == dstt);
                 // const int n_past = ((int32_t *) dst->op_params)[0];
|