llama_cpp 0.3.0 → 0.3.2 (diff of the bundled ggml.h)

@@ -201,6 +201,8 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_UNUSED(x) (void)(x)
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -209,6 +211,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3    (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
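
The comment in the hunk above shows the intended call pattern. As a rough sketch
of what the macro does: GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) token-pastes
the prefix with the indices 0..3, so it expands to four const locals, each
passed to the new GGML_UNUSED macro to silence unused-variable warnings:

    const int64_t ne10 = (src1)->ne[0]; GGML_UNUSED(ne10);
    const int64_t ne11 = (src1)->ne[1]; GGML_UNUSED(ne11);
    const int64_t ne12 = (src1)->ne[2]; GGML_UNUSED(ne12);
    const int64_t ne13 = (src1)->ne[3]; GGML_UNUSED(ne13);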
@@ -224,8 +250,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
     struct ggml_object;
     struct ggml_context;
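
The only change in this hunk is the element-count parameter: size_t becomes
int. A minimal caller sketch (buffer names and sizes are illustrative):

    float       src[8] = { 0.0f };
    ggml_fp16_t dst[8];
    ggml_fp32_to_fp16_row(src, dst, 8); // n is now an int, not a size_t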
@@ -295,12 +321,15 @@ extern "C" {
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -444,6 +472,9 @@ extern "C" {
 
 
     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
@@ -687,6 +718,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
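
A hedged usage sketch for the new op (tensor names and shapes are assumed for
illustration; ggml_new_tensor_2d is the existing ggml constructor):

    // n_vocab scores per row, one row per token
    struct ggml_tensor * probs = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_tokens);
    // per the comment above, argmax is taken along rows: one index per row
    struct ggml_tensor * best = ggml_argmax(ctx, probs);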
@@ -731,6 +767,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
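
The new unary ops mirror the existing ggml_relu pair; a brief sketch, assuming
x is an F32 tensor created earlier in the same context:

    struct ggml_tensor * y = ggml_tanh(ctx, x);        // result in a new tensor
    struct ggml_tensor * z = ggml_elu_inplace(ctx, x); // result written over x's data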
@@ -1081,58 +1133,33 @@ extern "C" {
             float                 min,
             float                 max);
 
-    // TODO: implement general-purpose convolutions
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3   80  768    1
-    // b:   3000   80    1    1
-    // res: 3000  768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
 
-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is equal to kernel size
-    // padding is zero
-    // example:
-    // a:   16   16    3  768
-    // b: 1024 1024    3    1
-    // res:  64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
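
A migration sketch for the removed helpers, derived from the deleted comments
("stride is equal to kernel size, padding is zero") and the new alias comment;
a is the kernel tensor and b the data tensor, as before:

    // old: ggml_conv_1d_s1_ph(ctx, a, b) -- stride 1, half padding, dilation 1
    struct ggml_tensor * c0 = ggml_conv_1d(ctx, a, b, 1, a->ne[0]/2, 1);
    struct ggml_tensor * c1 = ggml_conv_1d_ph(ctx, a, b, 1, 1); // same, via the alias

    // old: ggml_conv_1d_s2_ph(ctx, a, b) -- stride 2, half padding, dilation 1
    struct ggml_tensor * c2 = ggml_conv_1d_ph(ctx, a, b, 2, 1);

    // old: ggml_conv_2d_sk_p0(ctx, a, b) -- stride = kernel size, no padding
    struct ggml_tensor * c3 = ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);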
@@ -1488,25 +1515,24 @@ extern "C" {
     //
 
 #ifdef __cplusplus
-// restrict not standard in C++
+    // restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
-        dequantize_row_q_t dequantize_row_q;
-        quantize_row_q_t   quantize_row_q;
-        quantize_row_q_t   quantize_row_q_reference;
-        quantize_row_q_t   quantize_row_q_dot;
-        vec_dot_q_t        vec_dot_q;
-        enum ggml_type     vec_dot_type;
-    } quantize_fns_t;
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
 #ifdef __cplusplus
 }
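
A hedged sketch of the renamed lookup (the block pointer and element count are
assumed for illustration; 32 matches the QK4_0 block size in ggml's
quantization code):

    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    float out[32];
    traits.to_float(q4_block, out, 32); // replaces dequantize_row_q from the old quantize_fns_t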