llama_cpp 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
data/ext/llama_cpp/src/ggml-cuda.h

```diff
@@ -8,10 +8,6 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES 16
 
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-};
-
 void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);
 
```
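The `ggml_tensor_extra_gpu` definition moves out of the public header, while the split-tensor API stays. A hypothetical sketch of how a host program might drive it from C (the 0.75/0.25 proportions and the bare call order are illustrative assumptions, not code from the gem):

```c
#include "ggml-cuda.h"

int main(void) {
    ggml_init_cublas();

    // relative per-device proportions for split tensors (illustrative values)
    float tensor_split[GGML_CUDA_MAX_DEVICES] = { 0.75f, 0.25f };
    ggml_cuda_set_tensor_split(tensor_split);

    return 0;
}
```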
data/ext/llama_cpp/src/ggml-metal.h

```diff
@@ -34,9 +34,13 @@ extern "C" {
 
 struct ggml_metal_context;
 
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
 // creates a mapping between a host memory buffer and a device memory buffer
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
```
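`ggml_metal_init` now takes the number of Metal command buffers up front, and `ggml_metal_set_n_cb` can change it later. A minimal usage sketch from plain C (the values and the error handling are illustrative assumptions, not code from the gem):

```c
#include "ggml-metal.h"

int main(void) {
    // start with a single command buffer
    struct ggml_metal_context * ctx = ggml_metal_init(1);
    if (ctx == NULL) {
        return 1;
    }

    // later, let ggml_metal_graph_compute encode into 4 command buffers in parallel
    ggml_metal_set_n_cb(ctx, 4);

    ggml_metal_free(ctx);
    return 0;
}
```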
data/ext/llama_cpp/src/ggml-metal.m

```diff
@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
 };
 
 struct ggml_metal_context {
+    int n_cb;
+
     float * logits;
 
     id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
 @implementation GGMLMetalClass
 @end
 
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
+    ctx->n_cb = n_cb;
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
@@ -202,10 +205,16 @@ struct ggml_metal_context * ggml_metal_init(void) {
 
 void ggml_metal_free(struct ggml_metal_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);
-
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        [ctx->buffers[i].metal release];
+    }
     free(ctx);
 }
 
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -352,7 +361,7 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;
 
     NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
 
@@ -384,8 +393,8 @@ void ggml_metal_graph_compute(
             for (int i = node_start; i < node_end; ++i) {
                 metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
-                struct ggml_tensor * src0 = gf->nodes[i]->src0;
-                struct ggml_tensor * src1 = gf->nodes[i]->src1;
+                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
                 struct ggml_tensor * dst = gf->nodes[i];
 
                 const int64_t ne00 = src0 ? src0->ne[0] : 0;
@@ -441,6 +450,7 @@ void ggml_metal_graph_compute(
                 //}
 
                 switch (dst->op) {
+                    case GGML_OP_NONE:
                     case GGML_OP_RESHAPE:
                     case GGML_OP_VIEW:
                     case GGML_OP_TRANSPOSE:
@@ -729,7 +739,10 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                            [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
 
-                           if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                           if (src0t == GGML_TYPE_Q4_0) {
+                               [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                           }
+                           else if (src0t == GGML_TYPE_Q4_1) {
                                [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
```
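The Q4_0 matrix-vector path now dispatches one threadgroup per 8 rows of src0 (N_SIMDGROUP = 2 SIMD groups, each covering N_DST = 4 rows, per the ggml-metal.metal changes that follow). A small C sketch of the grid-size arithmetic used in that dispatch call, purely illustrative:

```c
#include <stdio.h>

// mirrors the expression in the Q4_0 dispatch branch above:
// ne01 / 8 threadgroups, plus one more when the remainder is odd
static int q4_0_threadgroups(int ne01) {
    return ne01 / 8 + ((ne01 % 8) & 0x01);
}

int main(void) {
    for (int ne01 = 30; ne01 <= 34; ++ne01) {
        printf("ne01 = %d -> %d threadgroups\n", ne01, q4_0_threadgroups(ne01));
    }
    return 0;
}
```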
data/ext/llama_cpp/src/ggml-metal.metal

```diff
@@ -365,6 +365,10 @@ kernel void kernel_rms_norm(
     }
 }
 
+// putting them in the kernel cause a significant performance penalty
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 kernel void kernel_mul_mat_q4_0_f32(
         device const void * src0,
         device const float * src1,
@@ -372,64 +376,69 @@ kernel void kernel_mul_mat_q4_0_f32(
         constant int64_t & ne00,
         constant int64_t & ne10,
         constant int64_t & ne0,
-        threadgroup float * sum [[threadgroup(0)]],
+        constant int64_t & ne01[[buffer(4)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
     const int nb = ne00/QK4_0;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
     device const float * y = (device const float *) src1 + r1*ne10;
+    block_q4_0 qb_curr, qb_next;
+    float4 y_curr[8]; // src1 vector cache
+    float sumf[N_DST]={0.f}, all_sum;
+    thread float * yl=(thread float *)y_curr;
+
+    // bootstrap
+    qb_curr = x[tiisg];
+    // each thread in a SIMD group deals with 1 block.
+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
+        }
 
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4; // 0 or 1
-    const int iy = tpitg.y - 4*ix; // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
+        for (int row = 0; row < N_DST; row++) {
+            // prefetch next x block
+            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
 
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
+            // calculate
+            float d = qb_curr.d;
+            float2 acc = {0.0f, 0.0f};
+            for (int i = 0; i < 16; i++) {
+                acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+                acc[1] += yl[i] + yl[i+16];
+            }
+            sumf[row] += d * (acc[0] - 8.f*acc[1]);
+            qb_curr = qb_next;
+        }
+    }
 
-        const float d = (float)x[i].d;
+    for (int i = 0; i < QK4_0 / 4; i++) {
+        y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
+    }
 
-        device const uint8_t * xl = x[i].qs + first;
-        device const float * yl = y + i * QK4_0 + first;
+    for (int row = 0; row < N_DST; row++) {
+        // prefetch next x block
+        qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
 
+        // calculate
+        float d = qb_curr.d;
         float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-
-            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
-            acc[1] += yl[j] + yl[j+16];
-
+        for (int i = 0; i < 16; i++) {
+            acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+            acc[1] += yl[i] + yl[i+16];
         }
+        if (tiisg < nb % N_SIMDWIDTH) {
+            sumf[row] += d * (acc[0] - 8.f*acc[1]);
+        }
+        qb_curr = qb_next;
 
-        sumf += d * (acc[0] - 8.f*acc[1]);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+        }
     }
 }
 
```
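The rewritten kernel keeps the same per-block math as before: a Q4_0 weight is d * (q - 8), so the dot product of a 32-element block folds into d * (sum(y*q) - 8 * sum(y)), which is the `acc[0] - 8.f*acc[1]` form above. A plain-C reference of that block math (block_q4_0_ref is a simplified stand-in for ggml's block_q4_0, which stores d as fp16; not code from the diff):

```c
#include <stdint.h>

#define QK4_0 32

// simplified stand-in for ggml's block_q4_0 (the real struct stores d as fp16)
typedef struct {
    float   d;              // block scale
    uint8_t qs[QK4_0 / 2];  // 32 4-bit quants packed into 16 bytes
} block_q4_0_ref;

// dot product of one Q4_0 block with 32 floats, in the acc0/acc1 form used by the kernel
static float q4_0_block_dot(const block_q4_0_ref * x, const float * y) {
    float acc0 = 0.0f; // sum of y[i] * quant[i]
    float acc1 = 0.0f; // sum of y[i]
    for (int i = 0; i < QK4_0 / 2; ++i) {
        const int q_lo = x->qs[i] & 0x0F; // elements 0..15
        const int q_hi = x->qs[i] >> 4;   // elements 16..31
        acc0 += y[i] * q_lo + y[i + 16] * q_hi;
        acc1 += y[i] + y[i + 16];
    }
    return x->d * (acc0 - 8.0f * acc1);
}
```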
data/ext/llama_cpp/src/ggml-mpi.c (new file)

```diff
@@ -0,0 +1,216 @@
+#include "ggml-mpi.h"
+
+#include "ggml.h"
+
+#include <mpi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_mpi_context {
+    int rank;
+    int size;
+};
+
+void ggml_mpi_backend_init(void) {
+    MPI_Init(NULL, NULL);
+}
+
+void ggml_mpi_backend_free(void) {
+    MPI_Finalize();
+}
+
+struct ggml_mpi_context * ggml_mpi_init(void) {
+    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
+
+    return ctx;
+}
+
+void ggml_mpi_free(struct ggml_mpi_context * ctx) {
+    free(ctx);
+}
+
+int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
+    return ctx->rank;
+}
+
+void ggml_mpi_eval_init(
+        struct ggml_mpi_context * ctx_mpi,
+        int * n_tokens,
+        int * n_past,
+        int * n_threads) {
+    UNUSED(ctx_mpi);
+
+    // synchronize the worker node parameters with the root node
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
+}
+
+static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
+    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
+    if (t == NULL) {
+        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
+        return -1;
+    }
+
+    for (int i = 0; i < gf->n_nodes; i++) {
+        if (gf->nodes[i] == t) {
+            return i;
+        }
+    }
+
+    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
+    return -1;
+}
+
+static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
+    MPI_Datatype mpi_type;
+
+    switch (t->type) {
+        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
+        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+}
+
+static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
+    MPI_Datatype mpi_type;
+
+    switch (t->type) {
+        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
+        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    MPI_Status status; UNUSED(status);
+
+    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+}
+
+// TODO: there are many improvements that can be done to this implementation
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                              int n_layers) {
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
+        return;
+    }
+
+    GGML_ASSERT(inp0 == gf->nodes[0]);
+
+    // distribute the compute graph into slices across the MPI nodes
+    //
+    // the main node (0) processes the last layers + the remainder of the compute graph
+    // and is responsible to pass the input tokens to the first node (1)
+    //
+    // node   1: [(  0) * n_per_node, (  1) * n_per_node)
+    // node   2: [(  1) * n_per_node, (  2) * n_per_node)
+    // ...
+    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
+    // node   0: [(n-1) * n_per_node,            n_nodes)
+    //
+    if (mpi_rank > 0) {
+        if (mpi_rank == 1) {
+            // the first node (1) receives the input tokens from the main node (0)
+            ggml_mpi_tensor_recv(inp_tokens, 0);
+        } else {
+            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
+        }
+    } else if (mpi_size > 1) {
+        // node 0 sends the input tokens to node 1
+        ggml_mpi_tensor_send(inp_tokens, 1);
+
+        // recv the output data from the last node
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
+    }
+
+    {
+        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
+
+        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
+
+        const int il0 =               (mpi_idx + 0) * n_per_node;
+        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
+
+        char name_l0[GGML_MAX_NAME];
+        char name_l1[GGML_MAX_NAME];
+
+        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
+        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
+
+        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
+        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
+
+        if (idx_l0 < 0 || idx_l1 < 0) {
+            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
+            return;
+        }
+
+        // attach the input data to all nodes that need it
+        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
+        for (int i = idx_l0; i < idx_l1; i++) {
+            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[0] = inp0;
+            }
+            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[1] = inp0;
+            }
+        }
+
+        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
+        for (int i = 1; i < idx_l1 - idx_l0; i++) {
+            gf->nodes[i] = gf->nodes[idx_l0 + i];
+            gf->grads[i] = gf->grads[idx_l0 + i];
+        }
+
+        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
+        if (mpi_idx != 0) {
+            gf->nodes[0]->op = GGML_OP_NONE;
+        }
+
+        gf->n_nodes = idx_l1 - idx_l0;
+
+        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
+    }
+}
+
+void ggml_mpi_graph_compute_post(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                              int n_layers) {
+    UNUSED(n_layers);
+
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    // send the output data to the next node
+    if (mpi_rank > 0) {
+        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
+    }
+}
```
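The new MPI backend slices the compute graph across ranks: `ggml_mpi_graph_compute_pre` trims gf to this rank's layer range and receives its input, the rank runs the trimmed graph locally, and `ggml_mpi_graph_compute_post` forwards the result to the next rank. A hypothetical wiring sketch in C (eval_graph_distributed is made up for illustration, and ggml_graph_compute_with_ctx is assumed to be the caller's usual graph-compute entry point in this ggml version):

```c
#include "ggml.h"
#include "ggml-mpi.h"

// hypothetical helper: evaluate one graph with the pipeline split across MPI ranks
static void eval_graph_distributed(
        struct ggml_mpi_context * ctx_mpi,
        struct ggml_context     * ctx,
        struct ggml_cgraph      * gf,
        int n_layer, int n_tokens, int n_past, int n_threads) {
    // broadcast the per-eval parameters from rank 0 to the workers
    ggml_mpi_eval_init(ctx_mpi, &n_tokens, &n_past, &n_threads);

    // trim gf to this rank's slice of layers and receive its input tensor
    ggml_mpi_graph_compute_pre(ctx_mpi, gf, n_layer);

    // run the local slice (assumed compute entry point for this ggml version)
    ggml_graph_compute_with_ctx(ctx, gf, n_threads);

    // send this rank's output on to the next rank in the ring
    ggml_mpi_graph_compute_post(ctx_mpi, gf, n_layer);
}
```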
data/ext/llama_cpp/src/ggml-mpi.h (new file)

```diff
@@ -0,0 +1,39 @@
+#pragma once
+
+struct ggml_context;
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_mpi_context;
+
+void ggml_mpi_backend_init(void);
+void ggml_mpi_backend_free(void);
+
+struct ggml_mpi_context * ggml_mpi_init(void);
+void ggml_mpi_free(struct ggml_mpi_context * ctx);
+
+int ggml_mpi_rank(struct ggml_mpi_context * ctx);
+
+void ggml_mpi_eval_init(
+        struct ggml_mpi_context * ctx_mpi,
+                            int * n_tokens,
+                            int * n_past,
+                            int * n_threads);
+
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                              int n_layers);
+
+void ggml_mpi_graph_compute_post(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                              int n_layers);
+
+#ifdef __cplusplus
+}
+#endif
```
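The header keeps the backend lifecycle separate from the per-run context. A minimal sketch of the call order, assuming the program is launched under mpirun; only functions declared in ggml-mpi.h are used:

```c
#include <stdio.h>
#include "ggml-mpi.h"

int main(void) {
    ggml_mpi_backend_init();                     // wraps MPI_Init

    struct ggml_mpi_context * ctx_mpi = ggml_mpi_init();
    printf("worker rank: %d\n", ggml_mpi_rank(ctx_mpi));

    // ... build graphs and call ggml_mpi_graph_compute_pre/post around evaluation ...

    ggml_mpi_free(ctx_mpi);
    ggml_mpi_backend_free();                     // wraps MPI_Finalize
    return 0;
}
```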
data/ext/llama_cpp/src/ggml-opencl.cpp

```diff
@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
     const int in = tid - step*im; // 0...15 or 0...7
 
-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
     const int is = 0;
-#else
+
+\n#else\n
+
     const int l0 = 4 * in; // 0, 4, 8, ..., 28
     const int is = in / 4;
-#endif
+
+\n#endif\n
+
     const int ql_offset = 64*im + l0;
     const int qh_offset = 32*im + l0;
     const int s_offset = 8*im + is;
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
 
     const float d = vload_half(0, &x[i].d);
 
-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
               + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
               + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
               + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
               +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
     tmp[16 * ix + tid] += sum;
-#else
+\n#else\n
     float sum = 0;
     for (int l = 0; l < 4; ++l) {
         sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
              + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
     }
     tmp[16 * ix + tid] += sum;
-#endif
+\n#endif\n
 
 }
 
@@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
     const int64_t ne0  = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
```