llama_cpp 0.7.0 → 0.7.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
+//
+// end of logging block
+//
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_CPU,
+        /*.buffer       =*/ NULL,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
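The two functions added above expose linear iteration over every tensor allocated in a context. A minimal usage sketch, not part of the diff (the helper name print_all_tensors is made up; assumes a context already populated elsewhere via ggml_init and tensor creation):

#include <stdio.h>
#include "ggml.h"

// Walk all tensors in a ggml_context using the new enumeration API
// and print their names and element counts.
static void print_all_tensors(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        printf("%-16s n_dims=%d nelements=%lld\n", t->name, t->n_dims, (long long) ggml_nelements(t));
    }
}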
@@ -8670,6 +8680,7 @@ void ggml_set_param(
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }
 
 // ggml_compute_forward_dup
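With the added ggml_format_name call, gradients created by ggml_set_param get a readable derived name. A hedged illustration, not from the diff (the tensor and its name are invented for the example; assumes a valid ctx from ggml_init):

// After this change, the auto-created gradient tensor is named
// after its parameter instead of being left unnamed.
struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
ggml_set_name(w, "w");
ggml_set_param(ctx, w);
// w->grad->name now reads "w (grad)"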
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(
 
 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
             UNUSED(x);
             assert(!isnan(x));
             assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
-    const int n = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
+    const int64_t n = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
     GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
 
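For reference, the per-head slope m_k applied a few lines below is derived from the m0 and m1 values in this hunk. A self-contained sketch mirroring the pattern in ggml_compute_forward_alibi_f32 (the helper alibi_slope is hypothetical, not part of the diff):

#include <math.h>

// Per-head ALiBi slope: heads up to the nearest power of two of n_head
// take successive powers of m0; remaining heads take odd powers of m1.
static float alibi_slope(int k, int n_head, float max_bias) {
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
    if (k < n_heads_log2_floor) {
        return powf(m0, (float) (k + 1));
    }
    return powf(m1, (float) (2*(k - n_heads_log2_floor) + 1));
}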
@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
                 }
 
                 pdst[0] = i * m_k + src[0];
-
             }
         }
     }
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -20203,6 +20211,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
+        // TODO: instead of passing &cancel here, use the return code of the linesearch
+        //       to determine if the optimization should be cancelled
+        //       this is a simple change, but not doing this atm, since I don't have a nice
+        //       way to test and don't want to break something with so many changes lined up
         ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_CANCEL;
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -326,7 +326,7 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
-    enum ggml_backend {
+    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
        GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +479,10 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type    type;
-        enum ggml_backend backend;
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;
 
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[12];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -702,6 +704,9 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1363,7 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
@@ -1367,7 +1372,7 @@ extern "C" {
 
     // clamp
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_clamp(
+    GGML_API struct ggml_tensor * ggml_clamp(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             float                 min,
@@ -2102,7 +2107,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
 }
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -29,7 +29,7 @@
 
 // 2-bit quantization
 // weight is represented as x = a * q + b
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 2.5625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
 // 3-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
 #ifdef GGML_QKK_64
 typedef struct {
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
 #endif
 
 // 4-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 #ifdef GGML_QKK_64
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
 #endif
 
 // 5-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 #ifdef GGML_QKK_64
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 
 // 6-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 6.5625 bits per weight
 typedef struct {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits