llama_cpp 0.7.1 → 0.9.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -231,8 +231,9 @@
231
231
  #define GGML_EXIT_SUCCESS 0
232
232
  #define GGML_EXIT_ABORTED 1
233
233
 
234
- #define GGUF_MAGIC 0x46554747 // "GGUF"
235
- #define GGUF_VERSION 2
234
+ #define GGUF_MAGIC "GGUF"
235
+
236
+ #define GGUF_VERSION 3
236
237
 
237
238
  #define GGUF_DEFAULT_ALIGNMENT 32
238
239
 
@@ -400,15 +401,16 @@ extern "C" {
400
401
  GGML_OP_ALIBI,
401
402
  GGML_OP_CLAMP,
402
403
  GGML_OP_CONV_1D,
403
- GGML_OP_CONV_2D,
404
+ GGML_OP_CONV_1D_STAGE_0, // internal
405
+ GGML_OP_CONV_1D_STAGE_1, // internal
404
406
  GGML_OP_CONV_TRANSPOSE_1D,
407
+ GGML_OP_CONV_2D,
408
+ GGML_OP_CONV_2D_STAGE_0, // internal
409
+ GGML_OP_CONV_2D_STAGE_1, // internal
405
410
  GGML_OP_CONV_TRANSPOSE_2D,
406
411
  GGML_OP_POOL_1D,
407
412
  GGML_OP_POOL_2D,
408
413
 
409
- GGML_OP_CONV_1D_STAGE_0, // internal
410
- GGML_OP_CONV_1D_STAGE_1, // internal
411
-
412
414
  GGML_OP_UPSCALE, // nearest interpolate
413
415
 
414
416
  GGML_OP_FLASH_ATTN,
@@ -1019,9 +1021,9 @@ extern "C" {
1019
1021
  struct ggml_tensor * b,
1020
1022
  float eps);
1021
1023
 
1022
- // A: n columns, m rows
1023
- // B: n columns, p rows (i.e. we transpose it internally)
1024
- // result is m columns, p rows
1024
+ // A: k columns, n rows => [ne03, ne02, n, k]
1025
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1026
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
1025
1027
  GGML_API struct ggml_tensor * ggml_mul_mat(
1026
1028
  struct ggml_context * ctx,
1027
1029
  struct ggml_tensor * a,
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
46
46
  #if defined(_MSC_VER) || defined(__MINGW32__)
47
47
  #include <intrin.h>
48
48
  #else
49
- #if !defined(__riscv)
49
+ #if !defined(__riscv) && !defined(__s390__)
50
50
  #include <immintrin.h>
51
51
  #endif
52
52
  #endif
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
462
462
  }
463
463
 
464
464
  size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
465
- const int nb = k / QK_K;
466
-
467
- // TODO - collect histograms - although, at a second thought, I don't really care about them
468
- (void)hist;
465
+ (void)hist; // TODO: collect histograms
469
466
 
470
- for (int j = 0; j < nb; j += k) {
467
+ for (int j = 0; j < n; j += k) {
471
468
  block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
472
469
  quantize_row_q2_K_reference(src + j, y, k);
473
470
  }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
678
675
  }
679
676
 
680
677
  size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
681
- const int nb = k / QK_K;
682
-
683
- // TODO - collect histograms - although, at a second thought, I don't really care about them
684
- (void)hist;
678
+ (void)hist; // TODO: collect histograms
685
679
 
686
- for (int j = 0; j < nb; j += k) {
680
+ for (int j = 0; j < n; j += k) {
687
681
  block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
688
682
  quantize_row_q3_K_reference(src + j, y, k);
689
683
  }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
846
840
 
847
841
  size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
848
842
  assert(k % QK_K == 0);
849
- const int nb = k / QK_K;
850
843
  (void)hist; // TODO: collect histograms
851
- for (int j = 0; j < nb; j += k) {
844
+
845
+ for (int j = 0; j < n; j += k) {
852
846
  block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
853
847
  quantize_row_q4_K_reference(src + j, y, k);
854
848
  }
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
1052
1046
 
1053
1047
  size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
1054
1048
  assert(k % QK_K == 0);
1055
- const int nb = k / QK_K;
1056
- (void)hist;
1057
- for (int j = 0; j < nb; j += k) {
1049
+ (void)hist; // TODO: collect histograms
1050
+
1051
+ for (int j = 0; j < n; j += k) {
1058
1052
  block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
1059
1053
  quantize_row_q5_K_reference(src + j, y, k);
1060
1054
  }
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
1200
1194
 
1201
1195
  size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
1202
1196
  assert(k % QK_K == 0);
1203
- const int nb = k / QK_K;
1204
-
1205
- (void)hist; // TODO
1197
+ (void)hist; // TODO: collect histograms
1206
1198
 
1207
- for (int j = 0; j < nb; j += k) {
1199
+ for (int j = 0; j < n; j += k) {
1208
1200
  block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
1209
1201
  quantize_row_q6_K_reference(src + j, y, k);
1210
1202
  }