llama_cpp 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -201,6 +201,8 @@
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4

+ #define GGML_UNUSED(x) (void)(x)
+
  #define GGML_ASSERT(x) \
      do { \
          if (!(x)) { \
@@ -209,6 +211,30 @@
          } \
      } while (0)

+ // used to copy the number of elements and stride in bytes of tensors into local variables.
+ // main purpose is to reduce code duplication and improve readability.
+ //
+ // example:
+ //
+ //    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ //    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+ //
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+     const type prefix##0 = (pointer)->array[0]; \
+     GGML_UNUSED(prefix##0);
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+     const type prefix##1 = (pointer)->array[1]; \
+     GGML_UNUSED(prefix##1);
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+     const type prefix##2 = (pointer)->array[2]; \
+     GGML_UNUSED(prefix##2);
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
+     const type prefix##3 = (pointer)->array[3]; \
+     GGML_UNUSED(prefix##3);
+
  #ifdef __cplusplus
  extern "C" {
  #endif
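The comment's own example, GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne), expands (per the definitions above) to four local constants, one per tensor dimension, with GGML_UNUSED silencing warnings for the ones a kernel does not touch. A minimal sketch of that expansion, assuming src1 is a struct ggml_tensor *:

    // expansion of GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne), reformatted for clarity
    const int64_t ne10 = (src1)->ne[0]; GGML_UNUSED(ne10);
    const int64_t ne11 = (src1)->ne[1]; GGML_UNUSED(ne11);
    const int64_t ne12 = (src1)->ne[2]; GGML_UNUSED(ne12);
    const int64_t ne13 = (src1)->ne[3]; GGML_UNUSED(ne13);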
@@ -224,8 +250,8 @@ extern "C" {
  GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);

  struct ggml_object;
  struct ggml_context;
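The only change to the row-conversion helpers is the count parameter, which went from size_t to int. A minimal usage sketch (the buffer contents are illustrative):

    float       src[4] = { 0.5f, 1.0f, -2.0f, 3.25f };
    ggml_fp16_t dst[4];

    ggml_fp32_to_fp16_row(src, dst, 4);       // n is now an int
    float roundtrip[4];
    ggml_fp16_to_fp32_row(dst, roundtrip, 4); // convert back for inspection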
@@ -295,12 +321,15 @@ extern "C" {
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
+ GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_ABS,
  GGML_OP_SGN,
  GGML_OP_NEG,
  GGML_OP_STEP,
+ GGML_OP_TANH,
+ GGML_OP_ELU,
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@ extern "C" {
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
- GGML_OP_CONV_1D_S1_PH,
- GGML_OP_CONV_1D_S2_PH,
- GGML_OP_CONV_2D_SK_P0,
+ GGML_OP_CONV_1D,
+ GGML_OP_CONV_2D,

  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -444,6 +472,9 @@ extern "C" {


  // compute types
+
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+ // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
  enum ggml_task_type {
  GGML_TASK_INIT = 0,
  GGML_TASK_COMPUTE,
@@ -687,6 +718,11 @@ extern "C" {
      struct ggml_context * ctx,
      struct ggml_tensor * a);

+ // argmax along rows
+ GGML_API struct ggml_tensor * ggml_argmax(
+     struct ggml_context * ctx,
+     struct ggml_tensor * a);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
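A hedged sketch of how the new ggml_argmax node might be used. The context setup and the tensor shape (n_vocab, n_rows) are illustrative, not part of the diff, and as with every ggml op the result is only produced when the graph is later computed:

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // logits: n_vocab values per row, n_rows rows
    struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_rows);
    struct ggml_tensor * ids    = ggml_argmax(ctx, logits); // one index per row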
@@ -731,6 +767,22 @@ extern "C" {
      struct ggml_context * ctx,
      struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_tanh(
+     struct ggml_context * ctx,
+     struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
+     struct ggml_context * ctx,
+     struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu(
+     struct ggml_context * ctx,
+     struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
+     struct ggml_context * ctx,
+     struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu(
      struct ggml_context * ctx,
      struct ggml_tensor * a);
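The new unary ops follow the same pattern as the existing ggml_relu/ggml_gelu pairs: a plain variant that produces a new result tensor and an _inplace variant that reuses the input's data. A small sketch, assuming a context ctx and an F32 tensor x already exist:

    struct ggml_tensor * t = ggml_tanh(ctx, x);  // result written to a new tensor
    t = ggml_elu_inplace(ctx, t);                // ELU applied in place on t's data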
@@ -1081,58 +1133,33 @@ extern "C" {
      float min,
      float max);

- // TODO: implement general-purpose convolutions
- // GGML_API struct ggml_tensor * ggml_conv_1d(
- //     struct ggml_context * ctx,
- //     struct ggml_tensor * a,
- //     struct ggml_tensor * b,
- //     int s0
- //     int p0,
- //     int d0);
- //
- // GGML_API struct ggml_tensor * ggml_conv_2d(
- //     struct ggml_context * ctx,
- //     struct ggml_tensor * a,
- //     struct ggml_tensor * b,
- //     int s0,
- //     int s1,
- //     int p0,
- //     int p1,
- //     int d0,
- //     int d1);
-
- // padding = half
- // TODO: we don't support extra parameters for now
- //       that's why we are hard-coding the stride, padding, and dilation
- //       not great ..
- // example:
- // a:      3   80  768    1
- // b:   3000   80    1    1
- // res: 3000  768    1    1
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d(
      struct ggml_context * ctx,
      struct ggml_tensor * a,
-     struct ggml_tensor * b);
+     struct ggml_tensor * b,
+     int s0,  // stride
+     int p0,  // padding
+     int d0); // dilation

- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+ GGML_API struct ggml_tensor * ggml_conv_2d(
      struct ggml_context * ctx,
      struct ggml_tensor * a,
-     struct ggml_tensor * b);
+     struct ggml_tensor * b,
+     int s0,
+     int s1,
+     int p0,
+     int p1,
+     int d0,
+     int d1);

- // kernel size is a->ne[0] x a->ne[1]
- // stride is equal to kernel size
- // padding is zero
- // example:
- // a:     16   16    3  768
- // b:   1024 1024    3    1
- // res:   64   64  768    1
- // used in sam
- GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+ // conv_1d with padding = half
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
      struct ggml_context * ctx,
      struct ggml_tensor * a,
-     struct ggml_tensor * b);
+     struct ggml_tensor * b,
+     int s,
+     int d);

  GGML_API struct ggml_tensor * ggml_flash_attn(
      struct ggml_context * ctx,
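Based on the comments removed above (stride/padding behaviour of the old hard-coded variants) and the new alias comment, callers can map the 0.3.0 entry points onto the generalized API roughly as follows. The exact equivalences are my reading of those comments, not something the diff states; a is the kernel tensor, b the input:

    // was: ggml_conv_1d_s1_ph(ctx, a, b)  -- stride 1, "half" padding, no dilation
    struct ggml_tensor * c0 = ggml_conv_1d_ph(ctx, a, b, /*s =*/ 1, /*d =*/ 1);
    //      equivalently: ggml_conv_1d(ctx, a, b, 1, a->ne[0]/2, 1);

    // was: ggml_conv_1d_s2_ph(ctx, a, b)  -- stride 2, "half" padding, no dilation
    struct ggml_tensor * c1 = ggml_conv_1d_ph(ctx, a, b, /*s =*/ 2, /*d =*/ 1);

    // was: ggml_conv_2d_sk_p0(ctx, a, b)  -- stride = kernel size, zero padding
    struct ggml_tensor * c2 = ggml_conv_2d(ctx, a, b,
            a->ne[0], a->ne[1],   // s0, s1: stride equal to the kernel size
            0, 0,                 // p0, p1: no padding
            1, 1);                // d0, d1: no dilation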
@@ -1488,25 +1515,24 @@ extern "C" {
  //

  #ifdef __cplusplus
- // restrict not standard in C++
+     // restrict not standard in C++
  #define GGML_RESTRICT
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

  typedef struct {
-     dequantize_row_q_t dequantize_row_q;
-     quantize_row_q_t   quantize_row_q;
-     quantize_row_q_t   quantize_row_q_reference;
-     quantize_row_q_t   quantize_row_q_dot;
-     vec_dot_q_t        vec_dot_q;
-     enum ggml_type     vec_dot_type;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+     ggml_to_float_t   to_float;
+     ggml_from_float_t from_float;
+     ggml_from_float_t from_float_reference;
+     ggml_vec_dot_t    vec_dot;
+     enum ggml_type    vec_dot_type;
+ } ggml_type_traits_t;
+
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);

  #ifdef __cplusplus
  }
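The quantize_fns_t table and its accessor were renamed and slimmed down: to_float/from_float/from_float_reference/vec_dot remain, while quantize_row_q_dot has no direct counterpart, and the accessor is now indexed by enum ggml_type rather than size_t. A hedged sketch of the new accessor in use, dequantizing one row of Q4_0 data; the helper and buffer names are mine, and n is assumed to be a multiple of the type's block size:

    #include "ggml.h"

    // illustrative helper, not part of the header
    static void dequantize_row_example(const void * q_data, float * out, int n) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        traits.to_float(q_data, out, n);   // replaces quantize_fns_t.dequantize_row_q
    }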