llama_cpp 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+  data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+  data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from master-b1140 to master-b1198.
+
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,3 +1,8 @@
+// defines MAP_ANONYMOUS
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -6,6 +11,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/types.h>
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor)
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
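The new `ggml_allocr_is_own` helper above replaces ad-hoc pointer comparisons: a tensor belongs to an allocator exactly when its data pointer falls inside the buffer's address range. Below is a minimal standalone C sketch of that range check; the `buffer` struct and function names are illustrative, not part of the gem's or ggml's API.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical buffer descriptor; ggml's real allocator struct carries more state. */
struct buffer {
    void * data;     /* start of the managed region */
    size_t max_size; /* bytes handed out so far */
};

/* True when ptr lies inside [data, data + max_size), i.e. this buffer owns it. */
static bool buffer_owns(const struct buffer * buf, const void * ptr) {
    const char * base = (const char *) buf->data;
    const char * p    = (const char *) ptr;
    return p >= base && p < base + buf->max_size;
}

int main(void) {
    char backing[64];
    struct buffer buf = { backing, sizeof(backing) };
    int external = 0;
    printf("inside:  %d\n", buffer_owns(&buf, backing + 8)); /* prints 1 */
    printf("outside: %d\n", buffer_owns(&buf, &external));   /* prints 0 */
    return 0;
}
```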
@@ -135,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
         return;
     }
 
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -281,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-//
-
-
-
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data = */
-        /*.size = */
+        /*.data = */ base_addr,
+        /*.size = */ size,
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
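The measure allocator now backs itself with reserved but uncommitted virtual memory (VirtualAlloc with MEM_RESERVE on Windows, mmap with PROT_NONE on POSIX), so the placeholder tensor pointers it hands out during graph measurement fall inside a real address range without consuming physical memory. A small Linux-flavoured C sketch of that reserve/release pattern is shown below; it is an illustration of the general technique, not code from the gem.

```c
#define _DEFAULT_SOURCE /* for MAP_ANONYMOUS on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void) {
    /* Reserve 1 GiB of address space. PROT_NONE means no pages are
       committed; any actual access would fault. */
    size_t size = (size_t)1 << 30;
    void * base = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    printf("reserved %zu bytes at %p without committing memory\n", size, base);

    /* Pointers may be computed inside [base, base + size) for bookkeeping,
       as the measure allocator does, as long as they are never dereferenced. */

    munmap(base, size);
    return EXIT_SUCCESS;
}
```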
@@ -311,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -380,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 
     // if the node's data is external, then we cannot re-use it
-    if ((
-        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+    if (ggml_allocr_is_own(alloc, parent) == false) {
         AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
         continue;
     }
@@ -415,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             AT_PRINTF("\n");
         }
 
-
         // update parents
         // update immediately if there is no parse_seq
         // update only at barriers if there is parse_seq
-        if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+        if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
            int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
            int update_end = alloc->parse_seq_len ? ind : ind + 1;
            for (int i = update_start; i < update_end; i++) {
@@ -521,12 +603,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                    view_src_hn->n_views -= 1;
                    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-
+                        ggml_allocr_free_tensor(alloc, view_src);
                    }
                }
                else {
                    if (parent->data != node->data) {
-
+                        ggml_allocr_free_tensor(alloc, parent);
                    }
                }
            }
@@ -543,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
        for (int i = 0; outputs[g][i] != NULL; i++) {
            struct ggml_tensor * output = outputs[g][i];
            AT_PRINTF("output: %s\n", output->name);
-
+            ggml_allocr_free_tensor(alloc, output);
        }
    }
 }
@@ -552,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
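The HIP path above now falls back to a manual per-lane saturating subtraction when the compiler lacks `__builtin_elementwise_sub_sat`. The clamping is ordinary integer saturation; the plain C sketch below shows the same idea for a single int8 lane, independent of the CUDA/HIP vector types in the diff.

```c
#include <stdint.h>
#include <stdio.h>

/* Saturating int8 subtraction: compute in a wider type, clamp to [-128, 127]. */
static int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = (int16_t)a - (int16_t)b;
    if (tmp > INT8_MAX) tmp = INT8_MAX;
    if (tmp < INT8_MIN) tmp = INT8_MIN;
    return (int8_t)tmp;
}

int main(void) {
    int8_t a[4] = { 100, -100, 5,   0 };
    int8_t b[4] = { -50,   50, 7, 127 };
    for (int i = 0; i < 4; i++) {
        /* 100 - (-50) saturates to 127, -100 - 50 saturates to -128 */
        printf("%4d - %4d -> %4d\n", a[i], b[i], sub_sat_i8(a[i], b[i]));
    }
    return 0;
}
```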
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-
-
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-
-
-
-
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean
-    var =
-    const float
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-
-
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-
-
-
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
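The rewritten norm_f32 kernel accumulates the sum and the sum of squares in a single pass (packed into a float2), reduces them across the block with warp shuffles, and derives the variance as E[x²] − E[x]². The scalar C sketch below shows the same one-pass mean/variance normalization with the CUDA-specific reduction stripped out; function and variable names are illustrative.

```c
#include <math.h>
#include <stdio.h>

/* One-pass layer norm over one row: accumulate sum (s1) and sum of squares (s2),
   then normalize with mean = s1/n and var = s2/n - mean*mean. */
static void norm_row(const float * x, float * dst, int n, float eps) {
    float s1 = 0.0f, s2 = 0.0f;
    for (int i = 0; i < n; i++) {
        s1 += x[i];
        s2 += x[i] * x[i];
    }
    const float mean    = s1 / n;
    const float var     = s2 / n - mean * mean;
    const float inv_std = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < n; i++) {
        dst[i] = (x[i] - mean) * inv_std;
    }
}

int main(void) {
    float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float y[4];
    norm_row(x, y, 4, 1e-5f);
    for (int i = 0; i < 4; i++) {
        printf("%f\n", y[i]); /* zero-mean, unit-variance row */
    }
    return 0;
}
```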
@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -76,6 +76,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);
 
-
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    id <MTLDevice> device;
+    NSString * s;
+    for (device in devices) {
+        s = [device name];
+        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
 
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
     ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;
@@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 
 void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
-    const int result = posix_memalign((void **) &data,
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;
@@ -384,7 +401,7 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    const size_t size_page =
+    const size_t size_page = sysconf(_SC_PAGESIZE);
 
     size_t size_aligned = size;
     if ((size_aligned % size_page) != 0) {
@@ -854,7 +871,11 @@ void ggml_metal_graph_compute(
                 {
                     nth0 = 32;
                     nth1 = 1;
-
+                    if (ne11 * ne12 < 4) {
+                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                    } else {
+                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                    }
                 } break;
             case GGML_TYPE_Q4_0:
                 {
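The f16 × f32 matrix multiplication now chooses between two Metal pipelines based on how many destination rows the batch actually needs: when ne11 * ne12 < 4 the new mul_mat_f16_f32_1row kernel is used, otherwise the existing batched kernel. A tiny C sketch of that dispatch heuristic follows; the enum and function names are illustrative and only mirror the condition shown in the diff.

```c
#include <stdio.h>

enum kernel { KERNEL_F16_F32_1ROW, KERNEL_F16_F32 };

/* Small batches (fewer than four rows overall) go to the single-row kernel,
   everything else stays on the batched kernel, as in the condition above. */
static enum kernel pick_f16_f32_kernel(int ne11, int ne12) {
    return (ne11 * ne12 < 4) ? KERNEL_F16_F32_1ROW : KERNEL_F16_F32;
}

int main(void) {
    printf("%s\n", pick_f16_f32_kernel(1, 1) == KERNEL_F16_F32_1ROW ? "1row" : "batched");
    printf("%s\n", pick_f16_f32_kernel(8, 2) == KERNEL_F16_F32_1ROW ? "1row" : "batched");
    return 0;
}
```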
@@ -906,8 +927,8 @@ void ggml_metal_graph_compute(
                     GGML_ASSERT(ne02 == 1);
                     GGML_ASSERT(ne12 == 1);
 
-                    nth0 =
-                    nth1 = 32;
+                    nth0 = 4; //1;
+                    nth1 = 8; //32;
                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                 } break;
             case GGML_TYPE_Q5_K:
@@ -955,9 +976,12 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
             if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+                src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
+            else if (src0t == GGML_TYPE_Q4_K) {
+                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+            }
             else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +995,8 @@ void ggml_metal_graph_compute(
             else if (src0t == GGML_TYPE_Q6_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             } else {
-
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01,
+                int64_t ny = (ne11 + 3)/4;
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
         }
     } break;
@@ -1117,7 +1141,7 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
             [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
-            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(
+            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
         } break;
     case GGML_OP_DUP:
     case GGML_OP_CPY: