llama_cpp 0.3.0 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -201,6 +201,8 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_UNUSED(x) (void)(x)
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
```
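The new `GGML_UNUSED` macro is groundwork for the tensor-locals helpers added in the next hunk: it silences unused-variable warnings for locals that a macro declares but a given code path never reads. A minimal sketch of the idiom (the surrounding function and variable are hypothetical):

```c
#define GGML_UNUSED(x) (void)(x)

// The cast-to-void "uses" the name without reading its value, which keeps
// -Wunused-variable quiet at zero runtime cost.
static int sum_len(int n) {
    const int n_padded = (n + 31) & ~31; // may go unread on some paths
    GGML_UNUSED(n_padded);               // expands to (void)(n_padded);
    return n;
}
```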
```diff
@@ -209,6 +211,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
```
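Fully expanded, `GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)` declares `ne00` through `ne03` from `src0->ne[0..3]`, voiding each through `GGML_UNUSED` so unused dimensions do not warn. A sketch of the intended use inside a compute kernel (the kernel itself is hypothetical; `ne` and `nb` are the real `ggml_tensor` fields for element counts and byte strides):

```c
#include "ggml.h"

// Hypothetical kernel skeleton: hoist the four dimension sizes and byte
// strides of a tensor into named locals, as the macro comment describes.
static void sketch_kernel(const struct ggml_tensor * src0) {
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); // ne00..ne03 = src0->ne[0..3]
    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); // nb00..nb03 = src0->nb[0..3]

    for (int64_t i = 0; i < ne00; ++i) {
        // element (i, 0, 0, 0) lives at (const char *) src0->data + i*nb00
    }
}
```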
```diff
@@ -224,8 +250,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
     struct ggml_object;
     struct ggml_context;
```
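Only the count parameter changed here (it is now a plain `int`). A round-trip sketch with the two row converters; the fixed scratch size and the guard are assumptions of the sketch, not anything the header requires:

```c
#include "ggml.h"

// Convert a row of floats to fp16 and back with the updated row API.
// The conversion is lossy for values outside fp16's range/precision.
void roundtrip_row(const float * src, float * dst, int n) {
    ggml_fp16_t tmp[512];
    if (n > 512) return;                // sketch-only bound on the scratch buffer
    ggml_fp32_to_fp16_row(src, tmp, n); // fp32 -> fp16
    ggml_fp16_to_fp32_row(tmp, dst, n); // fp16 -> fp32
}
```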
```diff
@@ -295,12 +321,15 @@ extern "C" {
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
```
```diff
@@ -332,9 +361,8 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
```
```diff
@@ -444,6 +472,9 @@ extern "C" {
 
 
     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
```
```diff
@@ -687,6 +718,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
```
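Per the comment, the reduction runs along rows (dimension 0), so applied to a logits tensor it yields one index per row. A greedy-pick sketch (the wrapper name and the per-row reading of the result are assumptions; the header does not spell out the result type):

```c
#include "ggml.h"

// Hypothetical wrapper: greedy (argmax) selection over per-row logits,
// e.g. a [n_vocab, n_rows] tensor -> one index per row.
static struct ggml_tensor * greedy_pick(struct ggml_context * ctx,
                                        struct ggml_tensor  * logits) {
    return ggml_argmax(ctx, logits); // reduces along dimension 0
}
```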
```diff
@@ -731,6 +767,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
```
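The new unary ops follow the same declaration pattern as `ggml_relu` just below them. A sketch wiring them into a tiny graph; the context size is an arbitrary example, and the graph is only built, not dispatched, since the compute entry points sit outside this excerpt:

```c
#include "ggml.h"

int main(void) {
    // Arbitrary example: 16 MiB of scratch memory for a toy graph.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * y = ggml_elu(ctx, ggml_tanh(ctx, x)); // y = elu(tanh(x))

    struct ggml_cgraph gf = ggml_build_forward(y); // built here, not executed
    (void) gf;

    ggml_free(ctx);
    return 0;
}
```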
```diff
@@ -1081,58 +1133,33 @@ extern "C" {
             float                 min,
             float                 max);
 
-
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //         struct ggml_context * ctx,
-    //         struct ggml_tensor  * a,
-    //         struct ggml_tensor  * b,
-    //         int                   s0
-    //         int                   p0,
-    //         int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //         struct ggml_context * ctx,
-    //         struct ggml_tensor  * a,
-    //         struct ggml_tensor  * b,
-    //         int                   s0,
-    //         int                   s1,
-    //         int                   p0,
-    //         int                   p1,
-    //         int                   d0,
-    //         int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3    80  768    1
-    // b:   3000    80    1    1
-    // res: 3000   768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
-
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
 
-    //
-    //
-
-    // example:
-    // a:     16   16    3  768
-    // b:   1024 1024    3    1
-    // res:   64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
```
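With stride, padding, and dilation now explicit, the output length follows standard convolution arithmetic. A small helper capturing the conventional formula (assumed, not quoted from the header, to match `ggml_conv_1d`), plus the `ggml_conv_1d_ph` identity restated from the comment:

```c
// Conventional dilated-convolution output length for input length n,
// kernel width k, stride s0, padding p0, dilation d0.
static int conv_1d_out_len(int n, int k, int s0, int p0, int d0) {
    return (n + 2*p0 - d0*(k - 1) - 1)/s0 + 1;
}

// Per the header comment, the "padding = half" variant is just a preset:
//   ggml_conv_1d_ph(ctx, a, b, s, d) == ggml_conv_1d(ctx, a, b, s, a->ne[0]/2, d)
```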
```diff
@@ -1488,25 +1515,24 @@ extern "C" {
     //
 
 #ifdef __cplusplus
-
+// restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
-        dequantize_row_q_t dequantize_row_q;
-        quantize_row_q_t   quantize_row_q;
-        quantize_row_q_t   quantize_row_q_reference;
-        quantize_row_q_t   quantize_row_q_dot;
-        vec_dot_q_t        vec_dot_q;
-        enum ggml_type     vec_dot_type;
-    } quantize_fns_t;
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
 #ifdef __cplusplus
 }
```
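The quantization-only function table becomes a general per-type trait table. A dequantization sketch using it (`GGML_TYPE_Q4_0` is one example type; keeping `n` a multiple of the type's block size is an assumption this sketch does not check):

```c
#include "ggml.h"

// Look up the trait table for Q4_0 and dequantize n elements to float.
static void dequantize_q4_0(const void * qdata, float * out, int n) {
    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    traits.to_float(qdata, out, n); // dispatches to the Q4_0 row dequantizer
}
```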