llama_cpp 0.7.1 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -231,8 +231,9 @@
231
231
  #define GGML_EXIT_SUCCESS 0
232
232
  #define GGML_EXIT_ABORTED 1
233
233
 
234
- #define GGUF_MAGIC 0x46554747 // "GGUF"
235
- #define GGUF_VERSION 2
234
+ #define GGUF_MAGIC "GGUF"
235
+
236
+ #define GGUF_VERSION 3
236
237
 
237
238
  #define GGUF_DEFAULT_ALIGNMENT 32
238
239
 
@@ -400,15 +401,16 @@ extern "C" {
400
401
  GGML_OP_ALIBI,
401
402
  GGML_OP_CLAMP,
402
403
  GGML_OP_CONV_1D,
403
- GGML_OP_CONV_2D,
404
+ GGML_OP_CONV_1D_STAGE_0, // internal
405
+ GGML_OP_CONV_1D_STAGE_1, // internal
404
406
  GGML_OP_CONV_TRANSPOSE_1D,
407
+ GGML_OP_CONV_2D,
408
+ GGML_OP_CONV_2D_STAGE_0, // internal
409
+ GGML_OP_CONV_2D_STAGE_1, // internal
405
410
  GGML_OP_CONV_TRANSPOSE_2D,
406
411
  GGML_OP_POOL_1D,
407
412
  GGML_OP_POOL_2D,
408
413
 
409
- GGML_OP_CONV_1D_STAGE_0, // internal
410
- GGML_OP_CONV_1D_STAGE_1, // internal
411
-
412
414
  GGML_OP_UPSCALE, // nearest interpolate
413
415
 
414
416
  GGML_OP_FLASH_ATTN,
@@ -1019,9 +1021,9 @@ extern "C" {
1019
1021
  struct ggml_tensor * b,
1020
1022
  float eps);
1021
1023
 
1022
- // A: n columns, m rows
1023
- // B: n columns, p rows (i.e. we transpose it internally)
1024
- // result is m columns, p rows
1024
+ // A: k columns, n rows => [ne03, ne02, n, k]
1025
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1026
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
1025
1027
  GGML_API struct ggml_tensor * ggml_mul_mat(
1026
1028
  struct ggml_context * ctx,
1027
1029
  struct ggml_tensor * a,
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
46
46
  #if defined(_MSC_VER) || defined(__MINGW32__)
47
47
  #include <intrin.h>
48
48
  #else
49
- #if !defined(__riscv)
49
+ #if !defined(__riscv) && !defined(__s390__)
50
50
  #include <immintrin.h>
51
51
  #endif
52
52
  #endif
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
462
462
  }
463
463
 
464
464
  size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
465
- const int nb = k / QK_K;
466
-
467
- // TODO - collect histograms - although, at a second thought, I don't really care about them
468
- (void)hist;
465
+ (void)hist; // TODO: collect histograms
469
466
 
470
- for (int j = 0; j < nb; j += k) {
467
+ for (int j = 0; j < n; j += k) {
471
468
  block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
472
469
  quantize_row_q2_K_reference(src + j, y, k);
473
470
  }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
678
675
  }
679
676
 
680
677
  size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
681
- const int nb = k / QK_K;
682
-
683
- // TODO - collect histograms - although, at a second thought, I don't really care about them
684
- (void)hist;
678
+ (void)hist; // TODO: collect histograms
685
679
 
686
- for (int j = 0; j < nb; j += k) {
680
+ for (int j = 0; j < n; j += k) {
687
681
  block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
688
682
  quantize_row_q3_K_reference(src + j, y, k);
689
683
  }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
846
840
 
847
841
  size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
848
842
  assert(k % QK_K == 0);
849
- const int nb = k / QK_K;
850
843
  (void)hist; // TODO: collect histograms
851
- for (int j = 0; j < nb; j += k) {
844
+
845
+ for (int j = 0; j < n; j += k) {
852
846
  block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
853
847
  quantize_row_q4_K_reference(src + j, y, k);
854
848
  }
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
1052
1046
 
1053
1047
  size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
1054
1048
  assert(k % QK_K == 0);
1055
- const int nb = k / QK_K;
1056
- (void)hist;
1057
- for (int j = 0; j < nb; j += k) {
1049
+ (void)hist; // TODO: collect histograms
1050
+
1051
+ for (int j = 0; j < n; j += k) {
1058
1052
  block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
1059
1053
  quantize_row_q5_K_reference(src + j, y, k);
1060
1054
  }
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
1200
1194
 
1201
1195
  size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
1202
1196
  assert(k % QK_K == 0);
1203
- const int nb = k / QK_K;
1204
-
1205
- (void)hist; // TODO
1197
+ (void)hist; // TODO: collect histograms
1206
1198
 
1207
- for (int j = 0; j < nb; j += k) {
1199
+ for (int j = 0; j < n; j += k) {
1208
1200
  block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
1209
1201
  quantize_row_q6_K_reference(src + j, y, k);
1210
1202
  }