llama_cpp 0.5.0 → 0.5.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
-  data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
+  metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+  data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
 SHA512:
-  metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
-  data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
+  metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+  data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from master-b1140 to master-b1198.
+
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
data/ext/llama_cpp/src/ggml-alloc.c CHANGED
@@ -1,3 +1,8 @@
+// defines MAP_ANONYMOUS
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -6,6 +11,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/types.h>
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-            return;
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }
 
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -281,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// address and size of the buffer when measuring
-// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
-static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
-static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data          = */ MEASURE_BASE_ADDR,
-        /*.size          = */ MEASURE_MAX_SIZE,
+        /*.data          = */ base_addr,
+        /*.size          = */ size,
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
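The measure allocator used to pretend its buffer started at the fixed address 0x1000, which could collide with real allocations; it now reserves a large span of uncommitted address space instead. A minimal sketch of the same reserve/release pattern, assuming a 64-bit POSIX system (the bundled code additionally handles Windows via VirtualAlloc and falls back to a fixed address on other platforms, as shown above):

```c
#define _GNU_SOURCE // for MAP_ANONYMOUS, mirroring the hunk at the top of ggml-alloc.c
#include <stdio.h>
#include <sys/mman.h>

int main(void) {
    size_t size = (size_t)1 << 40; // 1 TB of address space; nothing is committed because the pages are never touched
    void * base = MAP_FAILED;
    // PROT_NONE plus never touching the pages means only address space is consumed
    while (size > 0 && (base = mmap(NULL, size, PROT_NONE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) {
        size /= 2; // halve and retry, like alloc_measure_vmem above
    }
    if (base == MAP_FAILED) {
        return 1;
    }
    printf("reserved %zu bytes at %p\n", size, base);
    munmap(base, size); // release the reservation, like free_measure_vmem
    return 0;
}
```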
@@ -311,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -380,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
             }
 
             // if the node's data is external, then we cannot re-use it
-            if ((char *) parent->data < (char *) alloc->data ||
-                (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+            if (ggml_allocr_is_own(alloc, parent) == false) {
                 AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                 continue;
             }
@@ -415,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocator_alloc_graph_tensors_n(
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             AT_PRINTF("\n");
         }
 
-
         // update parents
         // update immediately if there is no parse_seq
         // update only at barriers if there is parse_seq
-        if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+        if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
             int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
             int update_end   = alloc->parse_seq_len ? ind : ind + 1;
             for (int i = update_start; i < update_end; i++) {
@@ -521,12 +603,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                         view_src_hn->n_views -= 1;
                         AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                         if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+                            ggml_allocr_free_tensor(alloc, view_src);
                         }
                     }
                     else {
                         if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
+                            ggml_allocr_free_tensor(alloc, parent);
                         }
                     }
                 }
@@ -543,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         for (int i = 0; outputs[g][i] != NULL; i++) {
             struct ggml_tensor * output = outputs[g][i];
             AT_PRINTF("output: %s\n", output->name);
-            ggml_allocator_free_tensor(alloc, output);
+            ggml_allocr_free_tensor(alloc, output);
         }
     }
 }
@@ -552,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
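The functions touched above are the ones used in the usual two-pass allocation flow: a measure allocator records how much memory a graph needs against the reserved address range, and a real allocator then places tensors in an actual buffer. A rough sketch of the measuring pass, using only the ggml_allocr calls that appear in the hunks above (build_graph is a hypothetical placeholder for the caller's graph construction):

```c
#include "ggml.h"
#include "ggml-alloc.h"

// hypothetical: builds the compute graph whose memory needs we want to measure
extern struct ggml_cgraph * build_graph(struct ggml_context * ctx);

size_t measure_graph_size(struct ggml_context * ctx, size_t alignment) {
    // the measure allocator only records offsets against the reserved,
    // uncommitted address range; no tensor data is actually written
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph(ctx));
    ggml_allocr_free(measure); // after this change, also releases the virtual memory reservation
    return mem_size;
}
```

A second allocator created with ggml_allocr_new over a real buffer of at least mem_size bytes then performs the actual placement.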
data/ext/llama_cpp/src/ggml-cuda.cu CHANGED
@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
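The fallback branch above emulates a saturating subtraction for compilers that lack __builtin_elementwise_sub_sat: it widens each lane to 16 bits and clamps the difference to the int8 range. The same arithmetic for a single scalar lane, as a plain C reference (the sub_sat_i8 name is hypothetical, not from the source):

```c
#include <stdint.h>

// subtract b from a, saturating at the bounds of int8_t
static inline int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = (int16_t)a - (int16_t)b; // widen so the difference cannot overflow
    if (tmp > INT8_MAX) tmp = INT8_MAX;    // e.g. 100 - (-100) saturates to 127
    if (tmp < INT8_MIN) tmp = INT8_MIN;    // e.g. -100 - 100 saturates to -128
    return (int8_t)tmp;
}
```

__vsubss4 applies this per byte of a packed 32-bit value, which is what the unrolled loop over the int8x4_t vector does.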
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-    float mean = 0.0f;
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-        mean += xi;
-        var += xi * xi;
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean /= ncols;
-    var = var / ncols - mean * mean;
-    const float inv_var = rsqrtf(var + eps);
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
data/ext/llama_cpp/src/ggml-metal.m CHANGED
@@ -76,6 +76,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);
 
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    id <MTLDevice> device;
+    NSString * s;
+    for (device in devices) {
+        s = [device name];
+        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
 
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
     ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;
@@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 
 void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;
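sysconf(_SC_PAGESIZE) is the standard POSIX replacement for the legacy getpagesize() call used before. A minimal sketch of the same page-aligned host allocation, assuming a POSIX system (standalone example, not part of the bundled sources):

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void) {
    const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
    void * data = NULL;
    // allocate one page, aligned on a page boundary, as ggml_metal_host_malloc does for n bytes
    if (posix_memalign(&data, page_size, page_size) != 0) {
        fprintf(stderr, "posix_memalign failed\n");
        return 1;
    }
    printf("page size: %zu bytes, buffer at %p\n", page_size, data);
    free(data);
    return 0;
}
```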
@@ -384,7 +401,7 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    const size_t size_page = getpagesize();
+    const size_t size_page = sysconf(_SC_PAGESIZE);
 
     size_t size_aligned = size;
     if ((size_aligned % size_page) != 0) {
@@ -854,7 +871,11 @@ void ggml_metal_graph_compute(
                     {
                         nth0 = 32;
                         nth1 = 1;
-                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                        if (ne11 * ne12 < 4) {
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                        } else {
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                        }
                     } break;
                 case GGML_TYPE_Q4_0:
                     {
@@ -906,8 +927,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
 
-                        nth0 = 2;
-                        nth1 = 32;
+                        nth0 = 4; //1;
+                        nth1 = 8; //32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                     } break;
                 case GGML_TYPE_Q5_K:
@@ -955,9 +976,12 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+                    src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
+                else if (src0t == GGML_TYPE_Q4_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
                 else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +995,8 @@ void ggml_metal_graph_compute(
                 else if (src0t == GGML_TYPE_Q6_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 } else {
-                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    int64_t ny = (ne11 + 3)/4;
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
             }
         } break;
@@ -1117,7 +1141,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                 [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
             } break;
         case GGML_OP_DUP:
         case GGML_OP_CPY: