llama_cpp 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
+//
+// end of logging block
+//
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_CPU,
+        /*.buffer       =*/ NULL,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
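The two functions added above give a linked-list-style walk over every tensor allocated in a ggml_context. A minimal usage sketch (the printing helper is illustrative, not part of the diff):

```c
#include <stdio.h>
#include "ggml.h"

// enumerate all tensors in a context with the new API
static void print_context_tensors(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
         t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("%-16s %lld elements\n", t->name, (long long) ggml_nelements(t));
    }
}
```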
@@ -8670,6 +8680,7 @@ void ggml_set_param(
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }
 
 // ggml_compute_forward_dup
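With ggml_format_name applied to the freshly duplicated gradient, tensors registered as parameters now produce readable names in graph dumps. A small sketch of the effect (buffer size and tensor name are arbitrary):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_name(w, "w");
    ggml_set_param(ctx, w); // allocates w->grad, which is now named too

    printf("%s\n", w->grad->name); // prints: w (grad)

    ggml_free(ctx);
    return 0;
}
```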
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(
 
 #ifndef NDEBUG
     for (int k = 0; k < nc; k++) {
-        const float x = ((float *) ((char *) dst->data + i1*(
+        const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
         UNUSED(x);
         assert(!isnan(x));
         assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
-    const int n = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
+    const int64_t n = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
     GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *) dst->data  + i*nb0 + j*nb1 + k*nb2);
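The index variables move from int to int64_t, matching the int64_t ne[] dimensions of ggml_tensor, which presumably guards against truncation once a dimension exceeds INT_MAX. The failure mode being avoided, in isolation (the dimension value is hypothetical):

```c
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne0 = (int64_t) INT_MAX + 2; // hypothetical oversized dimension

    const int     i32 = (int) ne0; // truncated: typically wraps to a negative value
    const int64_t i64 = ne0;       // preserved exactly

    printf("as int:     %d\n", i32);
    printf("as int64_t: %lld\n", (long long) i64);
    return 0;
}
```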
@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
                 }
 
                 pdst[0] = i * m_k + src[0];
-
             }
         }
     }
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int ith = params->ith;
     const int nth = params->nth;
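For context, GGML_TENSOR_BINARY_OP_LOCALS declares local shorthands for the dimensions and strides of src0, src1, and dst; the change merely terminates the invocation with a semicolon. A rough, abbreviated sketch of what the macro expands to (the real definition in ggml.h covers all four dimensions of each tensor):

```c
// abbreviated sketch, not the real macro from ggml.h
#define GGML_TENSOR_BINARY_OP_LOCALS_SKETCH            \
    const int64_t ne00 = src0->ne[0]; (void) ne00;     \
    const size_t  nb00 = src0->nb[0]; (void) nb00;     \
    const int64_t ne10 = src1->ne[0]; (void) ne10;     \
    const size_t  nb10 = src1->nb[0]; (void) nb10;     \
    const int64_t ne0  = dst->ne[0];  (void) ne0;      \
    const size_t  nb0  = dst->nb[0];  (void) nb0;
```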
@@ -20203,6 +20211,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
+        // TODO: instead of passing &cancel here, use the return code of the linesearch
+        //       to determine if the optimization should be cancelled
+        //       this is a simple change, but not doing this atm, since I don't have a nice
+        //       way to test and don't want to break something with so many changes lined up
         ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_CANCEL;
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -326,7 +326,7 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
-    enum ggml_backend {
+    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
        GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +479,10 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type    type;
-        enum ggml_backend backend;
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;
 
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[12];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
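On typical 64-bit (LP64) platforms the new buffer pointer adds 8 bytes and the trailing padding grows by another 8 (4 → 12), so the struct grows by exactly one 16-byte alignment unit and GGML_TENSOR_SIZE stays a multiple of GGML_MEM_ALIGN. A toy model of just the struct tail (not the real ggml_tensor layout):

```c
#include <stdio.h>

struct tail_v070 { void * extra; char padding[4]; };                 // before
struct tail_v071 { void * buffer; void * extra; char padding[12]; }; // after

int main(void) {
    // prints "16 -> 32" on LP64: both sizes are multiples of GGML_MEM_ALIGN (16)
    printf("%zu -> %zu\n", sizeof(struct tail_v070), sizeof(struct tail_v071));
    return 0;
}
```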
@@ -702,6 +704,9 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1363,7 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
@@ -1367,7 +1372,7 @@ extern "C" {
 
     // clamp
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_clamp(
+    GGML_API struct ggml_tensor * ggml_clamp(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             float                 min,
@@ -2102,7 +2107,7 @@ extern "C" {
         enum ggml_type vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef  __cplusplus
 }
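Adding GGML_API exports the type-traits lookup from the shared library, so bindings such as this gem can introspect quantization types. A minimal consumer touching only the vec_dot_type field visible in the hunk above:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // which intermediate type the Q4_K dot-product kernel expects
    const ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_K);
    printf("vec_dot_type = %d\n", (int) tt.vec_dot_type);
    return 0;
}
```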
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -29,7 +29,7 @@
 
 // 2-bit quantization
 // weight is represented as x = a * q + b
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 2.5625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 
 // 3-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
 #ifdef GGML_QKK_64
 typedef struct {
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
 #endif
 
 // 4-bit quantization
-//
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 #ifdef GGML_QKK_64
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 #endif
 
 // 5-bit quantization
-//
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 #ifdef GGML_QKK_64
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 
 // 6-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 6.5625 bits per weight
 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
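As a sanity check on the "Effectively 6.5625 bits per weight" comment: assuming QK_K = 256 and the full block_q6_K layout from k_quants.h (the qh, scales, and d members that follow ql are cut off in this excerpt), the per-super-block byte count works out as follows:

```c
#include <stdio.h>

#define QK_K 256

int main(void) {
    // block_q6_K member sizes (assumed from the full k_quants.h struct)
    const int ql     = QK_K/2;  // 128 bytes: lower 4 bits of the quants
    const int qh     = QK_K/4;  //  64 bytes: upper 2 bits of the quants
    const int scales = QK_K/16; //  16 bytes: 8-bit block scales
    const int d      = 2;       //   2 bytes: fp16 super-block scale

    const int bytes = ql + qh + scales + d;            // 210 bytes per 256 weights
    printf("%.4f bits per weight\n", bytes*8.0/QK_K);  // 6.5625
    return 0;
}
```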