llama_cpp 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
- data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
+ metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+ data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
  SHA512:
- metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
- data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
+ metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+ data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+ - Bump bundled llama.cpp from master-b1140 to master-b1198.
+
  ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02

  **Breaking Changes**
@@ -1,3 +1,8 @@
+ // defines MAP_ANONYMOUS
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #endif
+
  #include "ggml-alloc.h"
  #include "ggml.h"
  #include <assert.h>
@@ -6,6 +11,26 @@
  #include <stdlib.h>
  #include <string.h>

+ #ifdef __has_include
+ #if __has_include(<unistd.h>)
+ #include <unistd.h>
+ #if defined(_POSIX_MAPPED_FILES)
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #endif
+ #endif
+ #endif
+
+ #if defined(_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #include <memoryapi.h>
+ #endif
+
+
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
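
The guarded includes above let ggml-alloc.c decide at compile time whether POSIX mmap is available, and pull in the Windows virtual-memory headers otherwise. A minimal standalone sketch of the same detection pattern; `HAVE_MMAP` is a hypothetical marker macro used only in this sketch, not part of ggml:

```c
/* Compile-time feature detection for POSIX mmap, mirroring the
 * guarded includes in the hunk above. */
#ifdef __has_include
#  if __has_include(<unistd.h>)
#    include <unistd.h>
#    if defined(_POSIX_MAPPED_FILES)
#      include <sys/types.h>
#      include <sys/mman.h>
#      define HAVE_MMAP 1
#    endif
#  endif
#endif

#include <stdio.h>

int main(void) {
#ifdef HAVE_MMAP
    puts("POSIX mmap is available on this platform");
#else
    puts("POSIX mmap is not available; use a different allocation strategy");
#endif
    return 0;
}
```
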
@@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  }
  #endif

-
- static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  return ggml_nbytes(tensor);

  UNUSED(alloc);
  }

+ // check if a tensor is allocated by this buffer
+ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+ void * ptr = tensor->data;
+ return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
- GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+ GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
  #endif
- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  if (best_fit_block == -1) {
  // the last block is our last resort
  struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+ max_avail = MAX(max_avail, block->size);
  if (block->size >= size) {
  best_fit_block = alloc->n_free_blocks - 1;
- max_avail = MAX(max_avail, block->size);
  } else {
  fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
  __func__, size, max_avail);
  GGML_ASSERT(!"not enough space in the buffer");
- return;
+ return;
  }
  }
  struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  }

  // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  void * ptr = tensor->data;

- if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+ if (ggml_allocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
  return;
  }

- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);

@@ -281,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  return alloc;
  }

- // address and size of the buffer when measuring
- // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
- static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
- static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+ // OS specific functions to allocate and free uncommitted virtual memory
+ static void * alloc_vmem(size_t size) {
+ #if defined(_WIN32)
+ return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+ #elif defined(_POSIX_MAPPED_FILES)
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+ return ptr;
+ #else
+ // use a fixed address for other platforms
+ uintptr_t base_addr = (uintptr_t)-size - 0x100;
+ return (void *)base_addr;
+ #endif
+ }
+
+ static void free_vmem(void * base_addr, size_t size) {
+ #if defined(_WIN32)
+ VirtualFree(base_addr, 0, MEM_RELEASE);
+ UNUSED(size);
+ #elif defined(_POSIX_MAPPED_FILES)
+ munmap(base_addr, size);
+ #else
+ // nothing to do
+ UNUSED(base_addr);
+ UNUSED(size);
+ #endif
+ }
+
+ // allocate uncommitted virtual memory to measure the size of the graph
+ static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+ // 1TB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ do {
+ *base_addr = alloc_vmem(*size);
+ if (*base_addr != NULL) {
+ AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+ return;
+ }
+ // try again with half the size
+ *size /= 2;
+ } while (*size > 0);
+
+ GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+ }
+
+ static void free_measure_vmem(void * base_addr, size_t size) {
+ free_vmem(base_addr, size);
+ }

  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

+ void * base_addr;
+ size_t size;
+
+ alloc_measure_vmem(&base_addr, &size);
+
  *alloc = (struct ggml_allocr){
- /*.data = */ MEASURE_BASE_ADDR,
- /*.size = */ MEASURE_MAX_SIZE,
+ /*.data = */ base_addr,
+ /*.size = */ size,
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
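
The measure allocator no longer pretends to own a fixed 1 TB range at address 0x1000; it now reserves real but uncommitted address space from the OS and halves the request until the reservation succeeds. Below is a POSIX-only sketch of that reserve/probe/release pattern, assuming mmap with PROT_NONE is available (on Windows the patch uses VirtualAlloc with MEM_RESERVE instead); the names are illustrative, not the library's API:

```c
#define _GNU_SOURCE        /* for MAP_ANONYMOUS, as the patch defines at the top of the file */
#include <stdio.h>
#include <sys/mman.h>

/* Reserve address space without committing physical memory, halving the
 * request until it succeeds, the same strategy as alloc_measure_vmem. */
static void * reserve_probe(size_t * size) {
    while (*size > 0) {
        void * p = mmap(NULL, *size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p != MAP_FAILED) {
            return p;
        }
        *size /= 2;        /* try again with half the size */
    }
    return NULL;
}

int main(void) {
    size_t size = (size_t)1 << 40;   /* ask for 1 TB of address space (64-bit) */
    void * base = reserve_probe(&size);
    if (base != NULL) {
        printf("reserved %zu bytes of uncommitted address space at %p\n", size, base);
        munmap(base, size);          /* release the reservation */
    }
    return 0;
}
```
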
@@ -311,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  }

  void ggml_allocr_free(struct ggml_allocr * alloc) {
+ if (alloc->measure) {
+ free_measure_vmem(alloc->data, alloc->size);
+ }
  free(alloc);
  }

@@ -380,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }

  // if the node's data is external, then we cannot re-use it
- if ((char *) parent->data < (char *) alloc->data ||
- (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+ if (ggml_allocr_is_own(alloc, parent) == false) {
  AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
  continue;
  }
@@ -415,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
  }

- static size_t ggml_allocator_alloc_graph_tensors_n(
+ static size_t ggml_allocr_alloc_graph_tensors_n(
  struct ggml_allocr * alloc,
  struct ggml_cgraph ** graphs, int n_graphs,
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  AT_PRINTF("\n");
  }

-
  // update parents
  // update immediately if there is no parse_seq
  // update only at barriers if there is parse_seq
- if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
  int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
  int update_end = alloc->parse_seq_len ? ind : ind + 1;
  for (int i = update_start; i < update_end; i++) {
@@ -521,12 +603,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  view_src_hn->n_views -= 1;
  AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ ggml_allocr_free_tensor(alloc, view_src);
  }
  }
  else {
  if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ ggml_allocr_free_tensor(alloc, parent);
  }
  }
  }
@@ -543,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  for (int i = 0; outputs[g][i] != NULL; i++) {
  struct ggml_tensor * output = outputs[g][i];
  AT_PRINTF("output: %s\n", output->name);
- ggml_allocator_free_tensor(alloc, output);
+ ggml_allocr_free_tensor(alloc, output);
  }
  }
  }
@@ -552,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  }

  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
  }
@@ -81,12 +81,29 @@
  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

+ #ifndef __has_builtin
+ #define __has_builtin(x) 0
+ #endif
+
  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ #if __has_builtin(__builtin_elementwise_sub_sat)
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
  return reinterpret_cast<const int&>(c);
+ #else
+ int8x4_t c;
+ int16_t tmp;
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ tmp = va[i] - vb[i];
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+ c[i] = tmp;
+ }
+ return reinterpret_cast<int&>(c);
+ #endif // __has_builtin(__builtin_elementwise_sub_sat)
  }

  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
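
When the HIP toolchain lacks __builtin_elementwise_sub_sat, the new fallback clamps each 8-bit lane through a wider intermediate. The per-lane arithmetic is plain saturating subtraction, sketched here in C (illustrative only):

```c
#include <stdint.h>
#include <stdio.h>

/* Saturating int8 subtraction: widen, subtract, clamp to [-128, 127],
 * which is what the fallback branch above does for each of the 4 lanes. */
static int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = (int16_t)a - (int16_t)b;
    if (tmp > INT8_MAX) tmp = INT8_MAX;
    if (tmp < INT8_MIN) tmp = INT8_MIN;
    return (int8_t)tmp;
}

int main(void) {
    printf("%d\n", sub_sat_i8(-100, 100)); /* saturates to -128 */
    printf("%d\n", sub_sat_i8(100, -100)); /* saturates to  127 */
    return 0;
}
```
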
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

  const float eps = 1e-5f;

- float mean = 0.0f;
- float var = 0.0f;
+ float2 mean_var = make_float2(0.f, 0.f);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  const float xi = x[row*ncols + col];
- mean += xi;
- var += xi * xi;
+ mean_var.x += xi;
+ mean_var.y += xi * xi;
  }

  // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
- var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+ mean_var = warp_reduce_sum(mean_var);
+ if (block_size > WARP_SIZE) {
+ __shared__ float2 s_sum[32];
+ int warp_id = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ if (lane_id == 0) {
+ s_sum[warp_id] = mean_var;
+ }
+ __syncthreads();
+ mean_var = s_sum[lane_id];
+ mean_var = warp_reduce_sum(mean_var);
  }

- mean /= ncols;
- var = var / ncols - mean * mean;
- const float inv_var = rsqrtf(var + eps);
+ const float mean = mean_var.x / ncols;
+ const float var = mean_var.y / ncols - mean * mean;
+ const float inv_std = rsqrtf(var + eps);
+
+ for (int col = tid; col < ncols; col += block_size) {
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+ }
+ }

- for (int col = tid; col < ncols; col += WARP_SIZE) {
- dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
  }
+ return x;
  }

+ template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

  float tmp = 0.0f; // partial sum for thread in warp

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  const float xi = x[row*ncols + col];
  tmp += xi * xi;
  }

  // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ __shared__ float s_sum[32];
+ int warp_id = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ if (lane_id == 0) {
+ s_sum[warp_id] = tmp;
+ }
+ __syncthreads();
+ tmp = s_sum[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float mean = tmp / ncols;
  const float scale = rsqrtf(mean + eps);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  dst[row*ncols + col] = scale * x[row*ncols + col];
  }
  }
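
The reworked kernels accumulate the sum and the sum of squares in a single float2 and derive mean and variance afterwards via Var[x] = E[x^2] - E[x]^2, then reduce across the warp and, when the block is wider than a warp, once more through shared memory. Setting the CUDA reduction machinery aside, the single-pass mean/variance arithmetic looks like this in plain C (a sketch of the math, not the kernel itself):

```c
#include <math.h>
#include <stdio.h>

/* One-pass row normalization: accumulate sum and sum of squares, then use
 * Var[x] = E[x^2] - E[x]^2, the identity the float2 accumulator relies on. */
static void norm_row(const float * x, float * dst, int ncols, float eps) {
    float sum = 0.0f, sumsq = 0.0f;
    for (int col = 0; col < ncols; col++) {
        sum   += x[col];
        sumsq += x[col] * x[col];
    }
    const float mean    = sum / ncols;
    const float var     = sumsq / ncols - mean * mean;
    const float inv_std = 1.0f / sqrtf(var + eps);
    for (int col = 0; col < ncols; col++) {
        dst[col] = (x[col] - mean) * inv_std;
    }
}

int main(void) {
    float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f }, y[4];
    norm_row(x, y, 4, 1e-5f);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}
```
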
@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ if (ncols < 1024) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+ const dim3 block_dims(1024, 1, 1);
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ }
  }

  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ if (ncols < 1024) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ } else {
+ const dim3 block_dims(1024, 1, 1);
+ rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ }
  }

  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -76,6 +76,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(rms_norm);
  GGML_METAL_DECL_KERNEL(norm);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
  metal_printf("%s: allocating\n", __func__);

- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+ // Show all the Metal device instances in the system
+ NSArray * devices = MTLCopyAllDevices();
+ id <MTLDevice> device;
+ NSString * s;
+ for (device in devices) {
+ s = [device name];
+ metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+ }

+ // Pick and show default Metal device
+ device = MTLCreateSystemDefaultDevice();
+ s = [device name];
+ metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+ // Configure context
+ struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+ ctx->device = device;
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
- ctx->device = MTLCreateSystemDefaultDevice();
  ctx->queue = [ctx->device newCommandQueue];
  ctx->n_buffers = 0;
  ctx->concur_list_len = 0;
@@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(rms_norm);
  GGML_METAL_ADD_KERNEL(norm);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(rms_norm);
  GGML_METAL_DEL_KERNEL(norm);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {

  void * ggml_metal_host_malloc(size_t n) {
  void * data = NULL;
- const int result = posix_memalign((void **) &data, getpagesize(), n);
+ const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
  if (result != 0) {
  metal_printf("%s: error: posix_memalign failed\n", __func__);
  return NULL;
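
getpagesize() is a legacy interface, so the Metal host paths now query the page size through sysconf(_SC_PAGESIZE). A minimal sketch of the same page-aligned host allocation (a hypothetical wrapper, not the Metal backend's own function):

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Allocate n bytes aligned to the system page size, the alignment the
 * Metal backend wants for host buffers it later maps without copies. */
static void * page_aligned_alloc(size_t n) {
    void * data = NULL;
    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
    if (posix_memalign(&data, page, n) != 0) {
        return NULL;   /* allocation failed */
    }
    return data;
}

int main(void) {
    void * p = page_aligned_alloc(1 << 20);
    printf("allocated 1 MiB at %p\n", p);
    free(p);
    return 0;
}
```
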
@@ -384,7 +401,7 @@ bool ggml_metal_add_buffer(
  }
  }

- const size_t size_page = getpagesize();
+ const size_t size_page = sysconf(_SC_PAGESIZE);

  size_t size_aligned = size;
  if ((size_aligned % size_page) != 0) {
@@ -854,7 +871,11 @@ void ggml_metal_graph_compute(
  {
  nth0 = 32;
  nth1 = 1;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+ if (ne11 * ne12 < 4) {
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+ } else {
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+ }
  } break;
  case GGML_TYPE_Q4_0:
  {
@@ -906,8 +927,8 @@ void ggml_metal_graph_compute(
  GGML_ASSERT(ne02 == 1);
  GGML_ASSERT(ne12 == 1);

- nth0 = 2;
- nth1 = 32;
+ nth0 = 4; //1;
+ nth1 = 8; //32;
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
  } break;
  case GGML_TYPE_Q5_K:
@@ -955,9 +976,12 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];

  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+ src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  }
+ else if (src0t == GGML_TYPE_Q4_K) {
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ }
  else if (src0t == GGML_TYPE_Q3_K) {
  #ifdef GGML_QKK_64
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +995,8 @@ void ggml_metal_graph_compute(
  else if (src0t == GGML_TYPE_Q6_K) {
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  } else {
- [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ int64_t ny = (ne11 + 3)/4;
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  }
  }
  } break;
@@ -1117,7 +1141,7 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
  [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
  } break;
  case GGML_OP_DUP:
  case GGML_OP_CPY: