llama_cpp 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,9 +34,13 @@ extern "C" {
 
 struct ggml_metal_context;
 
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
 // creates a mapping between a host memory buffer and a device memory buffer
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
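
With this change the caller chooses the Metal command-buffer count up front and can change it later via ggml_metal_set_n_cb. A minimal usage sketch in C, based only on the declarations above; the helper name run_on_metal, the counts 1 and 4, and the calls to ggml_metal_add_buffer / ggml_metal_graph_compute from the pre-existing header are illustrative assumptions, not part of this diff:

#include "ggml.h"
#include "ggml-metal.h"

// hypothetical helper showing the 0.3.3 call sequence
static void run_on_metal(struct ggml_cgraph * gf) {
    struct ggml_metal_context * ctx = ggml_metal_init(1); // was ggml_metal_init(void) in 0.3.2

    // ... map the host buffers used by gf with ggml_metal_add_buffer(...) here ...

    ggml_metal_set_n_cb(ctx, 4);       // the encoder will split the graph across 4 command buffers
    ggml_metal_graph_compute(ctx, gf); // uses ctx->n_cb instead of gf->n_threads (see below)

    ggml_metal_free(ctx);
}
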
@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
 };
 
 struct ggml_metal_context {
+    int n_cb;
+
     float * logits;
 
     id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
 @implementation GGMLMetalClass
 @end
 
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
+    ctx->n_cb = n_cb;
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }
 
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;
 
     NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
 
@@ -386,8 +393,8 @@ void ggml_metal_graph_compute(
         for (int i = node_start; i < node_end; ++i) {
             metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
-            struct ggml_tensor * src0 = gf->nodes[i]->src0;
-            struct ggml_tensor * src1 = gf->nodes[i]->src1;
+            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
             struct ggml_tensor * dst = gf->nodes[i];
 
             const int64_t ne00 = src0 ? src0->ne[0] : 0;
@@ -443,6 +450,7 @@ void ggml_metal_graph_compute(
             //}
 
             switch (dst->op) {
+                case GGML_OP_NONE:
                 case GGML_OP_RESHAPE:
                 case GGML_OP_VIEW:
                 case GGML_OP_TRANSPOSE:
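
Adding GGML_OP_NONE to this pass-through group lets the Metal encoder skip no-op nodes instead of falling into the unsupported-op default. This plausibly pairs with the MPI backend introduced later in this diff, where ggml_mpi_graph_compute_pre marks the first node of a worker's graph slice as GGML_OP_NONE.
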
@@ -731,7 +739,10 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
 
-                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                if (src0t == GGML_TYPE_Q4_0) {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q4_1) {
                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
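
Dispatch geometry for the new Q4_0 path: each threadgroup contains N_SIMDGROUP = 2 SIMD groups and each SIMD group produces N_DST = 4 output rows, so one threadgroup covers 8 rows of src0. A worked example in C with illustrative sizes (ne01 = 4096 and ne11 = 1 are assumptions, not values from the diff); note that the extra term only adds one more threadgroup when ne01 % 8 is odd, and the kernel's bounds check against ne01 (shown in the next hunks) discards rows past the end:

#include <stdio.h>

int main(void) {
    // illustrative matrix sizes
    const int ne01 = 4096; // rows of the quantized src0 matrix
    const int ne11 = 1;    // columns of src1 (a single token during generation)

    // same expression as the new dispatch call above: 8 rows per threadgroup
    const int tg_x = ne01 / 8 + ((ne01 % 8) & 0x01); // 512
    const int tg_y = ne11;                           // one threadgroup per src1 column

    printf("threadgroups: %d x %d\n", tg_x, tg_y);
    return 0;
}
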
@@ -365,6 +365,10 @@ kernel void kernel_rms_norm(
     }
 }
 
+// putting them in the kernel cause a significant performance penalty
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 kernel void kernel_mul_mat_q4_0_f32(
         device const void * src0,
         device const float * src1,
@@ -372,64 +376,69 @@ kernel void kernel_mul_mat_q4_0_f32(
         constant int64_t & ne00,
         constant int64_t & ne10,
         constant int64_t & ne0,
-        threadgroup float * sum [[threadgroup(0)]],
+        constant int64_t & ne01[[buffer(4)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
     const int nb = ne00/QK4_0;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
     device const float * y = (device const float *) src1 + r1*ne10;
+    block_q4_0 qb_curr, qb_next;
+    float4 y_curr[8]; // src1 vector cache
+    float sumf[N_DST]={0.f}, all_sum;
+    thread float * yl=(thread float *)y_curr;
+
+    // bootstrap
+    qb_curr = x[tiisg];
+    // each thread in a SIMD group deals with 1 block.
+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
+        }
 
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;      // 0 or 1
-    const int iy = tpitg.y - 4*ix; // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
+        for (int row = 0; row < N_DST; row++) {
+            // prefetch next x block
+            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
 
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
+            // calculate
+            float d = qb_curr.d;
+            float2 acc = {0.0f, 0.0f};
+            for (int i = 0; i < 16; i++) {
+                acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+                acc[1] += yl[i] + yl[i+16];
+            }
+            sumf[row] += d * (acc[0] - 8.f*acc[1]);
+            qb_curr = qb_next;
+        }
+    }
 
-        const float d = (float)x[i].d;
+    for (int i = 0; i < QK4_0 / 4; i++) {
+        y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
+    }
 
-        device const uint8_t * xl = x[i].qs + first;
-        device const float * yl = y + i * QK4_0 + first;
+    for (int row = 0; row < N_DST; row++) {
+        // prefetch next x block
+        qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
 
+        // calculate
+        float d = qb_curr.d;
         float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-
-            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
-            acc[1] += yl[j] + yl[j+16];
-
+        for (int i = 0; i < 16; i++) {
+            acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+            acc[1] += yl[i] + yl[i+16];
         }
+        if (tiisg < nb % N_SIMDWIDTH) {
+            sumf[row] += d * (acc[0] - 8.f*acc[1]);
+        }
+        qb_curr = qb_next;
 
-        sumf += d * (acc[0] - 8.f*acc[1]);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+        }
     }
 }
 
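
What the rewritten kernel computes per block: a Q4_0 block stores 32 weights as one scale d plus 16 bytes of packed 4-bit values (low nibble = weights 0..15, high nibble = weights 16..31), and a weight dequantizes to d * (q - 8). Accumulating acc[0] = sum(q_i * y_i) and acc[1] = sum(y_i) therefore gives the block dot product as d * (acc[0] - 8 * acc[1]), which is exactly the expression used above. A scalar C reference sketch of that identity (the function name and standalone signature are illustrative, not part of the diff):

#include <stdint.h>

// reference dot product of one Q4_0 block (scale d, 16 packed bytes qs) with 32 floats y
static float q4_0_block_dot(float d, const uint8_t qs[16], const float y[32]) {
    float acc0 = 0.f; // sum of q_i * y_i
    float acc1 = 0.f; // sum of y_i
    for (int i = 0; i < 16; i++) {
        acc0 += y[i]      * (qs[i] & 0xF); // low nibble -> weight i
        acc0 += y[i + 16] * (qs[i] >> 4);  // high nibble -> weight i + 16
        acc1 += y[i] + y[i + 16];
    }
    // equals d * sum((q_i - 8) * y_i), since the 4-bit values carry a +8 offset
    return d * (acc0 - 8.f * acc1);
}
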
@@ -0,0 +1,216 @@
+#include "ggml-mpi.h"
+
+#include "ggml.h"
+
+#include <mpi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_mpi_context {
+    int rank;
+    int size;
+};
+
+void ggml_mpi_backend_init(void) {
+    MPI_Init(NULL, NULL);
+}
+
+void ggml_mpi_backend_free(void) {
+    MPI_Finalize();
+}
+
+struct ggml_mpi_context * ggml_mpi_init(void) {
+    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
+
+    return ctx;
+}
+
+void ggml_mpi_free(struct ggml_mpi_context * ctx) {
+    free(ctx);
+}
+
+int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
+    return ctx->rank;
+}
+
+void ggml_mpi_eval_init(
+        struct ggml_mpi_context * ctx_mpi,
+        int * n_tokens,
+        int * n_past,
+        int * n_threads) {
+    UNUSED(ctx_mpi);
+
+    // synchronize the worker node parameters with the root node
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
+}
+
+static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
+    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
+    if (t == NULL) {
+        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
+        return -1;
+    }
+
+    for (int i = 0; i < gf->n_nodes; i++) {
+        if (gf->nodes[i] == t) {
+            return i;
+        }
+    }
+
+    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
+    return -1;
+}
+
+static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
+    MPI_Datatype mpi_type;
+
+    switch (t->type) {
+        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
+        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+}
+
+static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
+    MPI_Datatype mpi_type;
+
+    switch (t->type) {
+        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
+        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    MPI_Status status; UNUSED(status);
+
+    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+}
+
+// TODO: there are many improvements that can be done to this implementation
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers) {
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
+        return;
+    }
+
+    GGML_ASSERT(inp0 == gf->nodes[0]);
+
+    // distribute the compute graph into slices across the MPI nodes
+    //
+    // the main node (0) processes the last layers + the remainder of the compute graph
+    // and is responsible to pass the input tokens to the first node (1)
+    //
+    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
+    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
+    // ...
+    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
+    // node 0:   [(n-1) * n_per_node,            n_nodes)
+    //
+    if (mpi_rank > 0) {
+        if (mpi_rank == 1) {
+            // the first node (1) receives the input tokens from the main node (0)
+            ggml_mpi_tensor_recv(inp_tokens, 0);
+        } else {
+            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
+        }
+    } else if (mpi_size > 1) {
+        // node 0 sends the input tokens to node 1
+        ggml_mpi_tensor_send(inp_tokens, 1);
+
+        // recv the output data from the last node
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
+    }
+
+    {
+        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
+
+        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
+
+        const int il0 =               (mpi_idx + 0) * n_per_node;
+        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
+
+        char name_l0[GGML_MAX_NAME];
+        char name_l1[GGML_MAX_NAME];
+
+        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
+        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
+
+        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
+        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
+
+        if (idx_l0 < 0 || idx_l1 < 0) {
+            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
+            return;
+        }
+
+        // attach the input data to all nodes that need it
+        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
+        for (int i = idx_l0; i < idx_l1; i++) {
+            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[0] = inp0;
+            }
+            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[1] = inp0;
+            }
+        }
+
+        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
+        for (int i = 1; i < idx_l1 - idx_l0; i++) {
+            gf->nodes[i] = gf->nodes[idx_l0 + i];
+            gf->grads[i] = gf->grads[idx_l0 + i];
+        }
+
+        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
+        if (mpi_idx != 0) {
+            gf->nodes[0]->op = GGML_OP_NONE;
+        }
+
+        gf->n_nodes = idx_l1 - idx_l0;
+
+        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
+    }
+}
+
+void ggml_mpi_graph_compute_post(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers) {
+    UNUSED(n_layers);
+
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    // send the output data to the next node
+    if (mpi_rank > 0) {
+        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
+    }
+}
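
The slice assignment in ggml_mpi_graph_compute_pre follows the comment block above: worker ranks 1..n-1 take consecutive layer ranges and rank 0 takes the last range plus the remainder of the compute graph. A worked example in C (n_layers = 32 and mpi_size = 4 are illustrative values, not from the diff):

#include <stdio.h>

int main(void) {
    const int n_layers = 32;
    const int mpi_size = 4;
    const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; // 8

    for (int mpi_rank = 0; mpi_rank < mpi_size; mpi_rank++) {
        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
        const int il0 = (mpi_idx + 0) * n_per_node;
        const int il1 = n_layers < (mpi_idx + 1) * n_per_node ? n_layers : (mpi_idx + 1) * n_per_node;
        // rank 1: [0,8)  rank 2: [8,16)  rank 3: [16,24)  rank 0: [24,32) + the rest of the graph
        printf("rank %d -> layers [%d, %d)\n", mpi_rank, il0, il1);
    }
    return 0;
}
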
@@ -0,0 +1,39 @@
+#pragma once
+
+struct ggml_context;
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_mpi_context;
+
+void ggml_mpi_backend_init(void);
+void ggml_mpi_backend_free(void);
+
+struct ggml_mpi_context * ggml_mpi_init(void);
+void ggml_mpi_free(struct ggml_mpi_context * ctx);
+
+int ggml_mpi_rank(struct ggml_mpi_context * ctx);
+
+void ggml_mpi_eval_init(
+        struct ggml_mpi_context * ctx_mpi,
+        int * n_tokens,
+        int * n_past,
+        int * n_threads);
+
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers);
+
+void ggml_mpi_graph_compute_post(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers);
+
+#ifdef __cplusplus
+}
+#endif
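
A sketch of how a host program might drive this API per evaluation, inferred only from the declarations above; build_graph(), the parameter values, and the placement of the backend init/free calls are assumptions about the caller, not part of the header:

#include "ggml.h"
#include "ggml-mpi.h"

// placeholder for whatever builds the transformer graph on this rank
extern struct ggml_cgraph * build_graph(int n_tokens, int n_past);

void eval_with_mpi(struct ggml_mpi_context * ctx_mpi, int n_layers) {
    int n_tokens = 0, n_past = 0, n_threads = 1; // rank 0 holds the real values

    // broadcast the evaluation parameters from rank 0 to all workers
    ggml_mpi_eval_init(ctx_mpi, &n_tokens, &n_past, &n_threads);

    struct ggml_cgraph * gf = build_graph(n_tokens, n_past);

    ggml_mpi_graph_compute_pre(ctx_mpi, gf, n_layers);  // receive input, trim gf to this rank's slice
    // ... run gf with the usual ggml compute backend here ...
    ggml_mpi_graph_compute_post(ctx_mpi, gf, n_layers); // send the slice output to the next rank
}

// ggml_mpi_backend_init()/ggml_mpi_init() would be called once at program start,
// ggml_mpi_free()/ggml_mpi_backend_free() once at shutdown.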