llama_cpp 0.14.2 → 0.14.4

ggml-vulkan.h
@@ -11,17 +11,6 @@ extern "C" {
  #define GGML_VK_MAX_DEVICES 16

  GGML_API void ggml_vk_instance_init(void);
- GGML_API void ggml_vk_init_cpu_assist(void);
-
- GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
- GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
- GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
- GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #endif
- GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
- GGML_API void ggml_vk_free_cpu_assist(void);

  // backend API
  GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
ggml.c
@@ -3,6 +3,7 @@

  #include "ggml-impl.h"
  #include "ggml-quants.h"
+ #include "ggml.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@

  #if defined(_WIN32)

+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
  #include <windows.h>

  typedef volatile LONG atomic_int;
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #include <Accelerate/Accelerate.h>
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
  #endif
  #elif defined(GGML_USE_OPENBLAS)
  #if defined(GGML_BLAS_USE_MKL)
@@ -282,14 +285,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #else
  #include <cblas.h>
  #endif
- #elif defined(GGML_USE_CUBLAS)
- #include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
- #elif defined(GGML_USE_SYCL)
- #include "ggml-sycl.h"
  #endif

  // floating point type used to accumulate sums
@@ -432,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
  #define ggml_perf_cycles_per_ms() 0
  #endif

+ //
+ // cross-platform UTF-8 file paths
+ //
+
+ #ifdef _WIN32
+ static wchar_t * ggml_mbstowcs(const char * mbs) {
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+ if (!wlen) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+ if (!wlen) {
+ GGML_FREE(wbuf);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ return wbuf;
+ }
+ #endif
+
+ FILE * ggml_fopen(const char * fname, const char * mode) {
+ #ifdef _WIN32
+ FILE * file = NULL;
+
+ // convert fname (UTF-8)
+ wchar_t * wfname = ggml_mbstowcs(fname);
+ if (wfname) {
+ // convert mode (ANSI)
+ wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+ wchar_t * wmode_p = wmode;
+ do {
+ *wmode_p++ = (wchar_t)*mode;
+ } while (*mode++);
+
+ // open file
+ file = _wfopen(wfname, wmode);
+
+ GGML_FREE(wfname);
+ GGML_FREE(wmode);
+ }
+
+ return file;
+ #else
+ return fopen(fname, mode);
+ #endif
+ }
+
  //
  // cache line
  //
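The new ggml_fopen() above converts a UTF-8 path to a wide string on Windows (MultiByteToWideChar plus _wfopen) and falls back to plain fopen() elsewhere; the fopen() call sites later in this diff are switched over to it. A minimal usage sketch, assuming the declaration is exported through the vendored ggml.h (the model path is only illustrative):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // a UTF-8 path also works on Windows, where ggml_fopen routes through _wfopen
        FILE * f = ggml_fopen("models/modèle-7b.gguf", "rb");
        if (f == NULL) {
            perror("ggml_fopen");
            return 1;
        }
        fclose(f);
        return 0;
    }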
@@ -742,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  .nrows = 1,
  },
+ [GGML_TYPE_IQ1_M] = {
+ .type_name = "iq1_m",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_iq1_m),
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
+ .from_float = NULL,
+ .from_float_reference = NULL,
+ .vec_dot = ggml_vec_dot_iq1_m_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
+ },
  [GGML_TYPE_IQ4_NL] = {
  .type_name = "iq4_nl",
  .blck_size = QK4_NL,
@@ -931,6 +991,101 @@ inline static float vaddvq_f32(float32x4_t v) {
  #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
  #endif

+ #elif defined(__AVX512F__)
+
+ #define GGML_SIMD
+
+ // F32 AVX512
+
+ #define GGML_F32_STEP 64
+ #define GGML_F32_EPR 16
+
+ #define GGML_F32x16 __m512
+ #define GGML_F32x16_ZERO _mm512_setzero_ps()
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+ #define GGML_F32x16_LOAD _mm512_loadu_ps
+ #define GGML_F32x16_STORE _mm512_storeu_ps
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+ #define GGML_F32x16_ADD _mm512_add_ps
+ #define GGML_F32x16_MUL _mm512_mul_ps
+ #define GGML_F32x16_REDUCE(res, x) \
+ do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = _mm512_reduce_add_ps(x[0]); \
+ } while (0)
+
+ // TODO: is this optimal ?
+
+ #define GGML_F32_VEC GGML_F32x16
+ #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
+ #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
+ #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
+ #define GGML_F32_VEC_STORE GGML_F32x16_STORE
+ #define GGML_F32_VEC_FMA GGML_F32x16_FMA
+ #define GGML_F32_VEC_ADD GGML_F32x16_ADD
+ #define GGML_F32_VEC_MUL GGML_F32x16_MUL
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+ // F16 AVX512
+
+ // F16 AVX
+
+ #define GGML_F16_STEP 64
+ #define GGML_F16_EPR 16
+
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+ #define GGML_F32Cx16 __m512
+ #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
+ #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
+
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+ // so F16C guard isn't required
+ #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+ #define GGML_F32Cx16_ADD _mm512_add_ps
+ #define GGML_F32Cx16_MUL _mm512_mul_ps
+ #define GGML_F32Cx16_REDUCE(res, x) \
+ do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = _mm512_reduce_add_ps(x[0]); \
+ } while (0)
+
+ #define GGML_F16_VEC GGML_F32Cx16
+ #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
+ #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
+ #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
+
  #elif defined(__AVX__)

  #define GGML_SIMD
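The AVX512 block above mirrors the existing AVX path: GGML_F32_STEP 64 with GGML_F32_EPR 16 means four 16-float accumulators per step, updated with fused multiply-add and collapsed at the end by pairwise adds plus _mm512_reduce_add_ps (F16 data is converted to F32 rather than relying on AVX512_FP16). A rough standalone sketch of the same pattern, not the GGML macros themselves (compile with -mavx512f; n is assumed to be a multiple of 64):

    #include <immintrin.h>

    static float dot_f32_avx512(const float * a, const float * b, int n) {
        __m512 acc[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(),
                          _mm512_setzero_ps(), _mm512_setzero_ps() };
        for (int i = 0; i < n; i += 64) {
            for (int j = 0; j < 4; ++j) {
                __m512 va = _mm512_loadu_ps(a + i + 16*j);
                __m512 vb = _mm512_loadu_ps(b + i + 16*j);
                acc[j] = _mm512_fmadd_ps(va, vb, acc[j]); // acc[j] += va * vb
            }
        }
        // pairwise reduction, as in GGML_F32x16_REDUCE
        acc[0] = _mm512_add_ps(acc[0], acc[2]);
        acc[1] = _mm512_add_ps(acc[1], acc[3]);
        acc[0] = _mm512_add_ps(acc[0], acc[1]);
        return _mm512_reduce_add_ps(acc[0]);
    }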
@@ -2392,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
  case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
  case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
+ case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
  case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
  case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
@@ -2447,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

+ GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ if (tensor->ne[i] == 0) {
+ // empty if any dimension has no elements
+ return true;
+ }
+ }
+ return false;
+ }
+
  bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -2461,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return
+ return ggml_is_empty(t0) ? ggml_is_empty(t1) :
  (t1->ne[0]%t0->ne[0] == 0) &&
  (t1->ne[1]%t0->ne[1] == 0) &&
  (t1->ne[2]%t0->ne[2] == 0) &&
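ggml_is_empty() becomes public API in this release (a tensor counts as empty when any ne[i] is 0) and is used further down to turn operations on zero-element tensors into no-ops, e.g. for a batch of zero tokens. A small usage sketch, assuming the declaration is exported via the vendored ggml.h (the memory size is arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { .mem_size = 1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(ip);

        // a 2D tensor with zero rows, e.g. an empty batch
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 0);
        printf("empty: %d, nelements: %lld\n", ggml_is_empty(t), (long long) ggml_nelements(t));

        ggml_free(ctx);
        return 0;
    }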
@@ -2545,14 +2711,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
  }

- #if defined(GGML_USE_CUBLAS)
- ggml_init_cublas();
- #elif defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CLBLAST)
  ggml_cl_init();
- #elif defined(GGML_USE_VULKAN)
- ggml_vk_init_cpu_assist();
- #elif defined(GGML_USE_SYCL)
- ggml_init_sycl();
  #endif

  ggml_setup_op_has_task_pass();
@@ -2772,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  data_size *= ne[i];
  }

- GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+ GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
  void * data = view_src != NULL ? view_src->data : NULL;
  if (data != NULL) {

@@ -4413,45 +4573,38 @@ void ggml_mul_mat_set_prec(

  // ggml_mul_mat_id

+ // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+ // this will allow computing all the used experts in a single matrix multiplication
  struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
- struct ggml_tensor * const as[],
- int n_as,
+ struct ggml_tensor * as,
  struct ggml_tensor * ids,
  int id,
  struct ggml_tensor * b) {

  GGML_ASSERT(ids->type == GGML_TYPE_I32);
- GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
- GGML_ASSERT(ids->ne[1] == b->ne[1]);
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+ GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
  GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
- GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
- GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+ GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat

  bool is_node = false;

- if (as[0]->grad || b->grad) {
+ if (as->grad || b->grad) {
  is_node = true;
  }

- const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+ const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  ggml_set_op_params_i32(result, 0, id);
- ggml_set_op_params_i32(result, 1, n_as);

  result->op = GGML_OP_MUL_MAT_ID;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = ids;
+ result->src[0] = as;
  result->src[1] = b;
-
- for (int i = 0; i < n_as; i++) {
- struct ggml_tensor * a = as[i];
- GGML_ASSERT(ggml_are_same_shape(as[0], a));
- GGML_ASSERT(ggml_can_mul_mat(a, b));
- GGML_ASSERT(!ggml_is_transposed(a));
- result->src[i + 2] = a;
- }
+ result->src[2] = ids;

  return result;
  }
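This is a breaking signature change in the vendored ggml: ggml_mul_mat_id() now takes a single 3D tensor with all experts stacked along dimension 2 instead of an array of per-expert 2D tensors, and the ids tensor moves from src[0] to src[2]. A hedged sketch of the new call shape (n_embd, n_ff, n_expert, n_expert_used and n_tokens are illustrative placeholders, not names from this diff):

    // 0.14.2: ggml_mul_mat_id(ctx, as_array, n_as, ids, id, b) with as_array[n_as] of 2D experts
    // 0.14.4: experts live in one 3D tensor and are selected per row of b through ids
    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);  // stacked experts
    struct ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);        // one row per token
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens); // expert indices per token
    struct ggml_tensor * out = ggml_mul_mat_id(ctx, as, ids, /*id=*/0, b);                      // -> [n_ff, n_tokens]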
@@ -7992,6 +8145,7 @@ static void ggml_compute_forward_add(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -8274,6 +8428,7 @@ static void ggml_compute_forward_add1(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -8401,6 +8556,7 @@ static void ggml_compute_forward_acc(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -10785,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {

- const struct ggml_tensor * ids = dst->src[0];
+ const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];
-
- const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+ const struct ggml_tensor * ids = dst->src[2];

  GGML_TENSOR_BINARY_OP_LOCALS

@@ -10818,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
  GGML_ASSERT(nb1 <= nb2);
  GGML_ASSERT(nb2 <= nb3);

- // broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
+ // broadcast is not supported with mmid
+ assert(ne12 == 1);
+ assert(ne13 == 1);

  // row groups
  const int id = ggml_get_op_params_i32(dst, 0);
- const int n_as = ggml_get_op_params_i32(dst, 1);
+ const int n_as = src0->ne[2];

  char * wdata_src1_end = (src1->type == vec_dot_type) ?
  (char *) params->wdata :
@@ -10884,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
  continue;
  }

- const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+ size_t src0_offset = cur_a*src0->nb[2];

  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10919,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
  continue;
  }

- assert(ne12 % ne02 == 0);
- assert(ne13 % ne03 == 0);
-
  // block-tiling attempt
  const int64_t blck_0 = 16;
  const int64_t blck_1 = 16;
@@ -10938,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
  const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);

  // broadcast src0 into src1
- const int64_t i03 = i13/r3;
- const int64_t i02 = i12/r2;
+ //const int64_t i03 = i13/r3;
+ //const int64_t i02 = i12/r2;

  const int64_t i1 = i11;
  const int64_t i2 = i12;
  const int64_t i3 = i13;

- const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+ const char * src0_row = (const char *) src0->data + src0_offset;

  // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
  // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11010,7 +11162,6 @@ static void ggml_compute_forward_out_prod_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
  // TODO: #if defined(GGML_USE_CLBLAST)

  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11361,6 @@ static void ggml_compute_forward_out_prod_q_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

  if (params->type == GGML_TASK_TYPE_INIT) {
@@ -11306,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -11497,6 +11648,7 @@ static void ggml_compute_forward_set(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -11720,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -12423,6 +12576,7 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -12511,6 +12665,7 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -15952,37 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  GGML_ASSERT(params);

- if (tensor->op == GGML_OP_NONE) {
- return;
- }
-
- #ifdef GGML_USE_CUBLAS
- bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
- if (skip_cpu) {
+ if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
  return;
  }
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #elif defined(GGML_USE_VULKAN)
- const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- if (skip_cpu) {
- ggml_vk_check_results_1_cpu_assist(params, tensor);
- }
- #endif
- if (skip_cpu) {
- return;
- }
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #endif // GGML_USE_CUBLAS

- #ifdef GGML_USE_SYCL
- bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
- if (skip_cpu) {
- return;
- }
- #endif // GGML_USE_SYCL
  switch (tensor->op) {
  case GGML_OP_DUP:
  {
@@ -17834,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
  static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
  int n_tasks = 0;

+ if (ggml_is_empty(node)) {
+ // no need to multi-thread a no-op
+ n_tasks = 1;
+ return n_tasks;
+ }
+
  switch (node->op) {
  case GGML_OP_CPY:
  case GGML_OP_DUP:
@@ -18319,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  case GGML_OP_MUL_MAT_ID:
  {
  cur = 0;
- const struct ggml_tensor * src0 = node->src[2];
+ const struct ggml_tensor * src0 = node->src[0];
  const struct ggml_tensor * src1 = node->src[1];
  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
  if (src1->type != vec_dot_type) {
  cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
  }
- const int n_as = ggml_get_op_params_i32(node, 1);
+ const int n_as = src0->ne[2];
  cur += GGML_PAD(cur, sizeof(int64_t)); // align
  cur += n_as * sizeof(int64_t); // matrix_row_counts
  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -18452,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  }
  }

- #ifdef GGML_USE_VULKAN
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
- }
- ggml_vk_preallocate_buffers_cpu_assist();
-
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
- }
- #endif
-
  const int n_threads = cplan->n_threads;

  struct ggml_compute_state_shared state_shared = {
@@ -18519,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  }
  }

- #ifdef GGML_USE_VULKAN
- ggml_vk_graph_cleanup_cpu_assist();
- #endif
-
  // performance stats (graph)
  {
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18657,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

  // write binary data
  {
- FILE * fout = fopen(fname, "wb");
+ FILE * fout = ggml_fopen(fname, "wb");

  if (!fout) {
  fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18795,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

  // read file into data
  {
- FILE * fin = fopen(fname, "rb");
+ FILE * fin = ggml_fopen(fname, "rb");
  if (!fin) {
  fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
  return result;
@@ -19131,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
  char color[16];

- FILE * fp = fopen(filename, "w");
+ FILE * fp = ggml_fopen(filename, "w");
  GGML_ASSERT(fp);

  fprintf(fp, "digraph G {\n");
@@ -20178,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
  case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
  default: // nothing
@@ -20203,7 +20323,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
  return
  type == GGML_TYPE_IQ2_XXS ||
  type == GGML_TYPE_IQ2_XS ||
- type == GGML_TYPE_IQ1_S;
+ type == GGML_TYPE_IQ1_S;// ||
+ //type == GGML_TYPE_IQ1_M;
  }

  size_t ggml_quantize_chunk(
@@ -20247,6 +20368,7 @@ size_t ggml_quantize_chunk(
  case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  #if QK_K == 64
  case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -20449,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
  }

  struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
- FILE * file = fopen(fname, "rb");
+ FILE * file = ggml_fopen(fname, "rb");
  if (!file) {
  return NULL;
  }
@@ -21404,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
  }

  void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
- FILE * file = fopen(fname, "wb");
+ FILE * file = ggml_fopen(fname, "wb");
  if (!file) {
  GGML_ASSERT(false && "failed to open file for writing");
  }
@@ -21546,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
  }

  int ggml_cpu_has_blas(void) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
  return 1;
  #else
  return 0;
  #endif
  }

- int ggml_cpu_has_cublas(void) {
- #if defined(GGML_USE_CUBLAS)
+ int ggml_cpu_has_cuda(void) {
+ #if defined(GGML_USE_CUDA)
  return 1;
  #else
  return 0;
@@ -21594,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
  }

  int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+ return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
  ggml_cpu_has_sycl();
  }