llama_cpp 0.3.0 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -201,6 +201,8 @@
|
|
201
201
|
#define GGML_MAX_NAME 48
|
202
202
|
#define GGML_DEFAULT_N_THREADS 4
|
203
203
|
|
204
|
+
#define GGML_UNUSED(x) (void)(x)
|
205
|
+
|
204
206
|
#define GGML_ASSERT(x) \
|
205
207
|
do { \
|
206
208
|
if (!(x)) { \
|
@@ -209,6 +211,30 @@
|
|
209
211
|
} \
|
210
212
|
} while (0)
|
211
213
|
|
214
|
+
// Used to copy the number of elements and the stride in bytes of a tensor into local variables.
|
215
|
+
// The main purpose is to reduce code duplication and improve readability.
|
216
|
+
//
|
217
|
+
// example:
|
218
|
+
//
|
219
|
+
// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
220
|
+
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
221
|
+
//
|
222
|
+
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
|
223
|
+
const type prefix##0 = (pointer)->array[0]; \
|
224
|
+
GGML_UNUSED(prefix##0);
|
225
|
+
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
|
226
|
+
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
|
227
|
+
const type prefix##1 = (pointer)->array[1]; \
|
228
|
+
GGML_UNUSED(prefix##1);
|
229
|
+
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
|
230
|
+
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
|
231
|
+
const type prefix##2 = (pointer)->array[2]; \
|
232
|
+
GGML_UNUSED(prefix##2);
|
233
|
+
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
|
234
|
+
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
|
235
|
+
const type prefix##3 = (pointer)->array[3]; \
|
236
|
+
GGML_UNUSED(prefix##3);
|
237
|
+
|
212
238
|
#ifdef __cplusplus
|
213
239
|
extern "C" {
|
214
240
|
#endif
|
@@ -224,8 +250,8 @@ extern "C" {
|
|
224
250
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
225
251
|
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
226
252
|
|
227
|
-
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
|
228
|
-
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
|
253
|
+
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
|
254
|
+
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
|
229
255
|
|
230
256
|
struct ggml_object;
|
231
257
|
struct ggml_context;
|
@@ -295,12 +321,15 @@ extern "C" {
|
|
295
321
|
GGML_OP_SUM,
|
296
322
|
GGML_OP_SUM_ROWS,
|
297
323
|
GGML_OP_MEAN,
|
324
|
+
GGML_OP_ARGMAX,
|
298
325
|
GGML_OP_REPEAT,
|
299
326
|
GGML_OP_REPEAT_BACK,
|
300
327
|
GGML_OP_ABS,
|
301
328
|
GGML_OP_SGN,
|
302
329
|
GGML_OP_NEG,
|
303
330
|
GGML_OP_STEP,
|
331
|
+
GGML_OP_TANH,
|
332
|
+
GGML_OP_ELU,
|
304
333
|
GGML_OP_RELU,
|
305
334
|
GGML_OP_GELU,
|
306
335
|
GGML_OP_GELU_QUICK,
|
@@ -332,9 +361,8 @@ extern "C" {
|
|
332
361
|
GGML_OP_ROPE_BACK,
|
333
362
|
GGML_OP_ALIBI,
|
334
363
|
GGML_OP_CLAMP,
|
335
|
-
|
336
|
-
|
337
|
-
GGML_OP_CONV_2D_SK_P0,
|
364
|
+
GGML_OP_CONV_1D,
|
365
|
+
GGML_OP_CONV_2D,
|
338
366
|
|
339
367
|
GGML_OP_FLASH_ATTN,
|
340
368
|
GGML_OP_FLASH_FF,
|
@@ -444,6 +472,9 @@ extern "C" {
|
|
444
472
|
|
445
473
|
|
446
474
|
// compute types
|
475
|
+
|
476
|
+
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
477
|
+
// This behavior changed in https://github.com/ggerganov/llama.cpp/pull/1995.
|
447
478
|
enum ggml_task_type {
|
448
479
|
GGML_TASK_INIT = 0,
|
449
480
|
GGML_TASK_COMPUTE,
|
@@ -687,6 +718,11 @@ extern "C" {
|
|
687
718
|
struct ggml_context * ctx,
|
688
719
|
struct ggml_tensor * a);
|
689
720
|
|
721
|
+
// argmax along rows
|
722
|
+
GGML_API struct ggml_tensor * ggml_argmax(
|
723
|
+
struct ggml_context * ctx,
|
724
|
+
struct ggml_tensor * a);
|
725
|
+
|
690
726
|
// if a is the same shape as b, and a is not parameter, return a
|
691
727
|
// otherwise, return a new tensor: repeat(a) to fit in b
|
692
728
|
GGML_API struct ggml_tensor * ggml_repeat(
|
@@ -731,6 +767,22 @@ extern "C" {
|
|
731
767
|
struct ggml_context * ctx,
|
732
768
|
struct ggml_tensor * a);
|
733
769
|
|
770
|
+
GGML_API struct ggml_tensor * ggml_tanh(
|
771
|
+
struct ggml_context * ctx,
|
772
|
+
struct ggml_tensor * a);
|
773
|
+
|
774
|
+
GGML_API struct ggml_tensor * ggml_tanh_inplace(
|
775
|
+
struct ggml_context * ctx,
|
776
|
+
struct ggml_tensor * a);
|
777
|
+
|
778
|
+
GGML_API struct ggml_tensor * ggml_elu(
|
779
|
+
struct ggml_context * ctx,
|
780
|
+
struct ggml_tensor * a);
|
781
|
+
|
782
|
+
GGML_API struct ggml_tensor * ggml_elu_inplace(
|
783
|
+
struct ggml_context * ctx,
|
784
|
+
struct ggml_tensor * a);
|
785
|
+
|
734
786
|
GGML_API struct ggml_tensor * ggml_relu(
|
735
787
|
struct ggml_context * ctx,
|
736
788
|
struct ggml_tensor * a);
|
@@ -1081,58 +1133,33 @@ extern "C" {
|
|
1081
1133
|
float min,
|
1082
1134
|
float max);
|
1083
1135
|
|
1084
|
-
|
1085
|
-
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
1086
|
-
// struct ggml_context * ctx,
|
1087
|
-
// struct ggml_tensor * a,
|
1088
|
-
// struct ggml_tensor * b,
|
1089
|
-
// int s0
|
1090
|
-
// int p0,
|
1091
|
-
// int d0);
|
1092
|
-
//
|
1093
|
-
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
1094
|
-
// struct ggml_context * ctx,
|
1095
|
-
// struct ggml_tensor * a,
|
1096
|
-
// struct ggml_tensor * b,
|
1097
|
-
// int s0,
|
1098
|
-
// int s1,
|
1099
|
-
// int p0,
|
1100
|
-
// int p1,
|
1101
|
-
// int d0,
|
1102
|
-
// int d1);
|
1103
|
-
|
1104
|
-
// padding = half
|
1105
|
-
// TODO: we don't support extra parameters for now
|
1106
|
-
// that's why we are hard-coding the stride, padding, and dilation
|
1107
|
-
// not great ..
|
1108
|
-
// example:
|
1109
|
-
// a: 3 80 768 1
|
1110
|
-
// b: 3000 80 1 1
|
1111
|
-
// res: 3000 768 1 1
|
1112
|
-
// used in whisper
|
1113
|
-
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
1136
|
+
GGML_API struct ggml_tensor * ggml_conv_1d(
|
1114
1137
|
struct ggml_context * ctx,
|
1115
1138
|
struct ggml_tensor * a,
|
1116
|
-
struct ggml_tensor * b
|
1139
|
+
struct ggml_tensor * b,
|
1140
|
+
int s0, // stride
|
1141
|
+
int p0, // padding
|
1142
|
+
int d0); // dilation
|
1117
1143
|
|
1118
|
-
|
1119
|
-
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
1144
|
+
GGML_API struct ggml_tensor * ggml_conv_2d(
|
1120
1145
|
struct ggml_context * ctx,
|
1121
1146
|
struct ggml_tensor * a,
|
1122
|
-
struct ggml_tensor * b
|
1147
|
+
struct ggml_tensor * b,
|
1148
|
+
int s0,
|
1149
|
+
int s1,
|
1150
|
+
int p0,
|
1151
|
+
int p1,
|
1152
|
+
int d0,
|
1153
|
+
int d1);
|
1123
1154
|
|
1124
|
-
//
|
1125
|
-
//
|
1126
|
-
|
1127
|
-
// example:
|
1128
|
-
// a: 16 16 3 768
|
1129
|
-
// b: 1024 1024 3 1
|
1130
|
-
// res: 64 64 768 1
|
1131
|
-
// used in sam
|
1132
|
-
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
1155
|
+
// conv_1d with padding = half
|
1156
|
+
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
1157
|
+
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
1133
1158
|
struct ggml_context * ctx,
|
1134
1159
|
struct ggml_tensor * a,
|
1135
|
-
struct ggml_tensor * b
|
1160
|
+
struct ggml_tensor * b,
|
1161
|
+
int s,
|
1162
|
+
int d);
|
1136
1163
|
|
1137
1164
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1138
1165
|
struct ggml_context * ctx,
|
@@ -1488,25 +1515,24 @@ extern "C" {
|
|
1488
1515
|
//
|
1489
1516
|
|
1490
1517
|
#ifdef __cplusplus
|
1491
|
-
|
1518
|
+
// `restrict` is not a standard keyword in C++
|
1492
1519
|
#define GGML_RESTRICT
|
1493
1520
|
#else
|
1494
1521
|
#define GGML_RESTRICT restrict
|
1495
1522
|
#endif
|
1496
|
-
typedef void (*
|
1497
|
-
typedef void (*
|
1498
|
-
typedef void (*
|
1523
|
+
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
1524
|
+
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
1525
|
+
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
1499
1526
|
|
1500
1527
|
typedef struct {
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1504
|
-
|
1505
|
-
|
1506
|
-
|
1507
|
-
|
1508
|
-
|
1509
|
-
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
1528
|
+
ggml_to_float_t to_float;
|
1529
|
+
ggml_from_float_t from_float;
|
1530
|
+
ggml_from_float_t from_float_reference;
|
1531
|
+
ggml_vec_dot_t vec_dot;
|
1532
|
+
enum ggml_type vec_dot_type;
|
1533
|
+
} ggml_type_traits_t;
|
1534
|
+
|
1535
|
+
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
|
1510
1536
|
|
1511
1537
|
#ifdef __cplusplus
|
1512
1538
|
}
|