llama_cpp 0.5.0 → 0.5.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+  data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+  data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from master-b1140 to master-b1198.
+
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**

data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,3 +1,8 @@
+// defines MAP_ANONYMOUS
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -6,6 +11,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/types.h>
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor)
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
         return;
     }
 
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -281,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-//
-
-
-
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data = */
-        /*.size = */
+        /*.data = */ base_addr,
+        /*.size = */ size,
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
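Note: the measure allocator above only needs address space, not committed memory. A minimal standalone C sketch of that reservation trick, assuming a POSIX system with mmap; this example is illustrative and not part of the gem:

```c
#include <stdio.h>
#include <sys/mman.h>

// Reserve a large range of address space without committing physical memory.
// PROT_NONE plus an anonymous private mapping means no pages are ever touched,
// so even a very large reservation costs essentially nothing until it is
// re-protected and written to.
int main(void) {
    size_t size = (size_t)1 << 30; // 1 GiB reservation for the demo
    void * base = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (base == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("reserved %zu bytes at %p (no physical memory committed)\n", size, base);
    munmap(base, size);
    return 0;
}
```

This is why alloc_measure_vmem can ask for 1 TB on 64-bit systems and simply halve the request until the kernel grants it.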
@@ -311,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -380,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
         }
 
         // if the node's data is external, then we cannot re-use it
-        if ((
-            (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+        if (ggml_allocr_is_own(alloc, parent) == false) {
             AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
             continue;
         }
@@ -415,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             AT_PRINTF("\n");
         }
 
-
         // update parents
         // update immediately if there is no parse_seq
         // update only at barriers if there is parse_seq
-        if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+        if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
             int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
             int update_end = alloc->parse_seq_len ? ind : ind + 1;
             for (int i = update_start; i < update_end; i++) {
@@ -521,12 +603,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                     view_src_hn->n_views -= 1;
                     AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-
+                        ggml_allocr_free_tensor(alloc, view_src);
                     }
                 }
                 else {
                     if (parent->data != node->data) {
-
+                        ggml_allocr_free_tensor(alloc, parent);
                     }
                 }
             }
@@ -543,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         for (int i = 0; outputs[g][i] != NULL; i++) {
             struct ggml_tensor * output = outputs[g][i];
             AT_PRINTF("output: %s\n", output->name);
-
+            ggml_allocr_free_tensor(alloc, output);
         }
     }
 }
@@ -552,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }

data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
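Note: the HIP fallback above emulates __vsubss4 by clamping each byte lane into the int8 range. A plain C sketch of that per-lane saturating subtraction, for reference only (sub_sat_i8 is an illustrative name, not part of the diff):

```c
#include <stdint.h>
#include <stdio.h>

// Saturating int8 subtraction, one lane at a time: compute the difference in a
// wider 16-bit intermediate and clamp it into [-128, 127] before narrowing.
static int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = (int16_t)a - (int16_t)b;
    if (tmp > INT8_MAX) tmp = INT8_MAX;
    if (tmp < INT8_MIN) tmp = INT8_MIN;
    return (int8_t)tmp;
}

int main(void) {
    printf("%d\n", sub_sat_i8(-100, 100)); // -128, not -200
    printf("%d\n", sub_sat_i8(100, -100)); //  127, not  200
    return 0;
}
```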
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-
-
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-
-
-
-
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean
-    var =
-    const float
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-
-
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-
-
-
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
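Note: the reworked norm_f32 kernel accumulates the sum and the sum of squares in a single pass and recovers the variance as E[x^2] - E[x]^2. A scalar C sketch of the same computation, for reference only (norm_row is an illustrative name, not part of the diff):

```c
#include <math.h>
#include <stdio.h>

// One-pass layer norm over a row: accumulate sum(x) and sum(x*x) together,
// then take mean = sum/n and var = sum_sq/n - mean^2, matching the float2
// accumulator used by the CUDA kernel above.
static void norm_row(const float * x, float * dst, int ncols) {
    const float eps = 1e-5f;
    float sum = 0.0f, sum_sq = 0.0f;
    for (int i = 0; i < ncols; i++) {
        sum    += x[i];
        sum_sq += x[i] * x[i];
    }
    const float mean    = sum / ncols;
    const float var     = sum_sq / ncols - mean * mean;
    const float inv_std = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < ncols; i++) {
        dst[i] = (x[i] - mean) * inv_std;
    }
}

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, y[4];
    norm_row(x, y, 4);
    for (int i = 0; i < 4; i++) printf("%f\n", y[i]);
    return 0;
}
```

Folding both sums into one accumulator is what lets the kernel get away with a single warp/block reduction instead of two.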
@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {

data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -76,6 +76,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);
 
-
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    id <MTLDevice> device;
+    NSString * s;
+    for (device in devices) {
+        s = [device name];
+        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
 
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
     ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;
@@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_ADD_KERNEL(rms_norm);
     GGML_METAL_ADD_KERNEL(norm);
     GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 
 void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
-    const int result = posix_memalign((void **) &data,
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;
@@ -384,7 +401,7 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    const size_t size_page =
+    const size_t size_page = sysconf(_SC_PAGESIZE);
 
     size_t size_aligned = size;
     if ((size_aligned % size_page) != 0) {
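Note: both Metal changes above replace a hard-coded page size with sysconf(_SC_PAGESIZE) and round buffer sizes up to whole pages before allocating them page-aligned. A small C sketch of that pattern, assuming a POSIX system (illustrative only, not part of the gem):

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// Query the page size at runtime, round the requested size up to a whole
// number of pages, and allocate the buffer page-aligned.
int main(void) {
    const size_t page = (size_t)sysconf(_SC_PAGESIZE);
    size_t size = 100000;
    if (size % page != 0) {
        size += page - (size % page);
    }
    void * data = NULL;
    if (posix_memalign(&data, page, size) != 0) {
        fprintf(stderr, "posix_memalign failed\n");
        return 1;
    }
    printf("page = %zu, aligned size = %zu, ptr = %p\n", page, size, data);
    free(data);
    return 0;
}
```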
@@ -854,7 +871,11 @@ void ggml_metal_graph_compute(
                         {
                             nth0 = 32;
                             nth1 = 1;
-
+                            if (ne11 * ne12 < 4) {
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                            }
                         } break;
                     case GGML_TYPE_Q4_0:
                         {
@@ -906,8 +927,8 @@ void ggml_metal_graph_compute(
                             GGML_ASSERT(ne02 == 1);
                             GGML_ASSERT(ne12 == 1);
 
-                            nth0 =
-                            nth1 = 32;
+                            nth0 = 4; //1;
+                            nth1 = 8; //32;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                         } break;
                     case GGML_TYPE_Q5_K:
@@ -955,9 +976,12 @@ void ggml_metal_graph_compute(
                     [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
                     if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                        src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+                        src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
                         [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                     }
+                    else if (src0t == GGML_TYPE_Q4_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
                     else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                         [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +995,8 @@ void ggml_metal_graph_compute(
                     else if (src0t == GGML_TYPE_Q6_K) {
                         [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                     } else {
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01,
+                        int64_t ny = (ne11 + 3)/4;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                     }
                 }
             } break;
@@ -1117,7 +1141,7 @@ void ggml_metal_graph_compute(
                     [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
                     [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                 } break;
             case GGML_OP_DUP:
             case GGML_OP_CPY: