llama_cpp 0.15.2 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml-cuda.cu

@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-
-
-
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
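The new ggml_backend_cuda_log_set_callback entry point lets a host application redirect these messages instead of having the default callback write them to stderr. A minimal usage sketch, assuming the ggml_log_callback signature declared in ggml.h (void (*)(enum ggml_log_level, const char * text, void * user_data)); the sink and function name below are illustrative, not part of this diff:

// Illustrative only: route CUDA backend log messages to a caller-chosen FILE *.
static void my_cuda_logger(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * sink = (FILE *) user_data;                  // user_data is handed back unchanged
    fprintf(sink, "[cuda %d] %s", (int) level, text);  // the backend's messages already end with '\n'
}

// During initialization, e.g. right after creating the CUDA backend:
//     ggml_backend_cuda_log_set_callback(my_cuda_logger, stderr);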
@@ -79,6 +119,20 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+    ggml_cuda_set_device(device);
+#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
+    auto res = hipMallocManaged(ptr, size);
+    if (res == hipSuccess) {
+        // if error we "need" to know why...
+        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+    }
+    return res;
+#else
+    return cudaMalloc(ptr, size);
+#endif
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
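The allocation paths updated later in this file all go through this wrapper and check its cudaError_t result. A sketch of that calling pattern, mirroring the buffer-type allocator hunk below (nbytes and device are placeholder variables, not names from the diff):

// Sketch of the calling pattern used by the allocators below (placeholders: nbytes, device).
void * dev_ptr = nullptr;
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, nbytes, device);
if (err != cudaSuccess) {
    cudaGetLastError(); // clear the sticky error before reporting and falling back
    GGML_CUDA_LOG_ERROR("allocation of %zu bytes on device %d failed: %s\n", nbytes, device, cudaGetErrorString(err));
}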
@@ -91,7 +145,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -99,16 +153,16 @@
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -129,7 +183,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -231,12 +285,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
-        CUDA_CHECK(
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-
-
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -250,7 +304,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -497,9 +551,11 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
-    cudaError_t err =
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
-
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -756,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
         char * buf;
-        CUDA_CHECK(
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
@@ -1002,8 +1058,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1814,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         }
     }
 #else
-    if (r2 == 1 && r3 == 1 && src0
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
@@ -2246,7 +2302,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2356,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2468,15 +2524,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
     bool use_cuda_graph = true;
     bool cuda_graph_update_required = false;
-    //
+    // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
-    void
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2579,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
         }
 
         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
         }
 
@@ -2539,16 +2595,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // Changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }
 
         if (node->op == GGML_OP_CPY) {
             // store the copy op parameter which changes with each token.
             cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
-
-
-
+            // store a pointer to each copy op CUDA kernel to identify it later
+            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+            if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                ggml_cuda_cpy_fn_ptrs.push_back(ptr);
             }
         }
 
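The list is simply a de-duplicated set of kernel pointers: each CPY node's kernel is recorded once here, and in the graph-update path further down a captured node's parameters are only patched when its .func is found in this set. A standalone sketch of the same pattern, with hypothetical names:

// Standalone sketch of the pointer de-duplication pattern above (hypothetical names).
#include <algorithm>
#include <vector>

static void remember_kernel(std::vector<void *> & seen, void * kernel_ptr) {
    // store each distinct kernel pointer only once
    if (std::find(seen.begin(), seen.end(), kernel_ptr) == seen.end()) {
        seen.push_back(kernel_ptr);
    }
}

static bool is_tracked_kernel(const std::vector<void *> & seen, void * kernel_ptr) {
    // membership test, matching the count(...) > 0 check used in the update path
    return std::count(seen.begin(), seen.end(), kernel_ptr) > 0;
}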
@@ -2567,7 +2624,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2662,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
         bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
         if (!ok) {
-
+            GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
     }
@@ -2624,7 +2681,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2678,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
             int k = 0;
             for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                if (cuda_ctx->cuda_graph->params[i].func
+                if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
                     char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
                     cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
                     CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
@@ -2691,7 +2748,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
             if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
                 // The pre-existing graph exec cannot be updated due to violated constraints
                 // so instead clear error and re-instantiate
@@ -2829,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2948,13 +3007,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2998,8 +3057,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
     // clear the error
     cudaGetLastError();
 
-
-
+    GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                       size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
data/vendor/tmp/llama.cpp/ggml-cuda.h

@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
data/vendor/tmp/llama.cpp/ggml-impl.h

@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
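The m512bh/m512i wrappers appear to exist because MSVC does not accept the GCC/Clang-style casts between __m512i and the AVX512-BF16 vector type, so on Windows the macro becomes a no-op. An illustrative call site (assumed, not taken from this diff) where the same expression then compiles on both toolchains:

// Illustrative only: the cast is hidden behind m512bh(), so the line below builds with
// both MSVC and GCC/Clang. Assumes AVX512-BF16 support and x_vec/y_vec loaded as __m512i.
__m512 acc = _mm512_setzero_ps();
acc = _mm512_dpbf16_ps(acc, m512bh(x_vec), m512bh(y_vec)); // bf16 dot-product accumulate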
@@ -132,6 +144,10 @@ extern "C" {
 #endif
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#endif
+
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -443,6 +459,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
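These helpers play the role that _mm_set1_ps/_mm256_set1_ps play on x86: they broadcast a single float into every vector lane by reinterpreting it as an integer and replicating a general-purpose register. A hypothetical use, assuming a __loongarch_asx build and a scale factor d:

// Illustrative only (assumes __loongarch_asx): broadcast the scale factor d into all
// eight float lanes, the LASX counterpart of _mm256_set1_ps(d) on x86.
float d = 0.5f;
__m256 vd = __lasx_xvreplfr2vr_s(d); // every lane of vd now holds d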
data/vendor/tmp/llama.cpp/ggml-kompute.cpp

@@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 {
                     GGML_ASSERT(ne00 == ne10);
 
-                    // TODO: assert that dim2 and dim3 are contiguous
                     GGML_ASSERT(ne12 % ne02 == 0);
                     GGML_ASSERT(ne13 % ne03 == 0);
 
@@ -1677,6 +1676,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_ROPE:
                 {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];