llama_cpp 0.14.2 → 0.14.4

@@ -11,17 +11,6 @@ extern "C" {
  #define GGML_VK_MAX_DEVICES 16

  GGML_API void ggml_vk_instance_init(void);
- GGML_API void ggml_vk_init_cpu_assist(void);
-
- GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
- GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
- GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
- GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #endif
- GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
- GGML_API void ggml_vk_free_cpu_assist(void);

  // backend API
  GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
@@ -3,6 +3,7 @@

  #include "ggml-impl.h"
  #include "ggml-quants.h"
+ #include "ggml.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@

  #if defined(_WIN32)

+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
  #include <windows.h>

  typedef volatile LONG atomic_int;
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #include <Accelerate/Accelerate.h>
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
  #endif
  #elif defined(GGML_USE_OPENBLAS)
  #if defined(GGML_BLAS_USE_MKL)
@@ -282,14 +285,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #else
  #include <cblas.h>
  #endif
- #elif defined(GGML_USE_CUBLAS)
- #include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
- #elif defined(GGML_USE_SYCL)
- #include "ggml-sycl.h"
  #endif

  // floating point type used to accumulate sums
@@ -432,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
  #define ggml_perf_cycles_per_ms() 0
  #endif

+ //
+ // cross-platform UTF-8 file paths
+ //
+
+ #ifdef _WIN32
+ static wchar_t * ggml_mbstowcs(const char * mbs) {
+     int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+     if (!wlen) {
+         errno = EINVAL;
+         return NULL;
+     }
+
+     wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+     wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+     if (!wlen) {
+         GGML_FREE(wbuf);
+         errno = EINVAL;
+         return NULL;
+     }
+
+     return wbuf;
+ }
+ #endif
+
+ FILE * ggml_fopen(const char * fname, const char * mode) {
+ #ifdef _WIN32
+     FILE * file = NULL;
+
+     // convert fname (UTF-8)
+     wchar_t * wfname = ggml_mbstowcs(fname);
+     if (wfname) {
+         // convert mode (ANSI)
+         wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+         wchar_t * wmode_p = wmode;
+         do {
+             *wmode_p++ = (wchar_t)*mode;
+         } while (*mode++);
+
+         // open file
+         file = _wfopen(wfname, wmode);
+
+         GGML_FREE(wfname);
+         GGML_FREE(wmode);
+     }
+
+     return file;
+ #else
+     return fopen(fname, mode);
+ #endif
+ }
+
  //
  // cache line
  //
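Note: the ggml_fopen wrapper added above lets callers pass UTF-8 paths on every platform; on Windows the path is widened and handed to _wfopen, elsewhere it falls through to plain fopen. A minimal usage sketch follows (the helper and its path handling are illustrative, not code from the package; ggml_fopen is assumed here to be declared in ggml.h):

    #include <stdio.h>
    #include "ggml.h"   // assumed to declare: FILE * ggml_fopen(const char * fname, const char * mode);

    // Hypothetical caller: the path may contain non-ASCII characters on Windows.
    static int dump_magic(const char * utf8_path) {
        FILE * f = ggml_fopen(utf8_path, "rb");   // UTF-8 safe on _WIN32, plain fopen elsewhere
        if (!f) {
            return -1;
        }
        unsigned char magic[4] = {0};
        size_t n = fread(magic, 1, sizeof(magic), f);
        printf("read %zu bytes: %02x %02x %02x %02x\n", n, magic[0], magic[1], magic[2], magic[3]);
        fclose(f);
        return 0;
    }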
@@ -742,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .vec_dot_type = GGML_TYPE_Q8_K,
          .nrows = 1,
      },
+     [GGML_TYPE_IQ1_M] = {
+         .type_name = "iq1_m",
+         .blck_size = QK_K,
+         .type_size = sizeof(block_iq1_m),
+         .is_quantized = true,
+         .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
+         .from_float = NULL,
+         .from_float_reference = NULL,
+         .vec_dot = ggml_vec_dot_iq1_m_q8_K,
+         .vec_dot_type = GGML_TYPE_Q8_K,
+         .nrows = 1,
+     },
      [GGML_TYPE_IQ4_NL] = {
          .type_name = "iq4_nl",
          .blck_size = QK4_NL,
@@ -931,6 +991,101 @@ inline static float vaddvq_f32(float32x4_t v) {
  #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
  #endif

+ #elif defined(__AVX512F__)
+
+ #define GGML_SIMD
+
+ // F32 AVX512
+
+ #define GGML_F32_STEP 64
+ #define GGML_F32_EPR  16
+
+ #define GGML_F32x16         __m512
+ #define GGML_F32x16_ZERO    _mm512_setzero_ps()
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+ #define GGML_F32x16_LOAD    _mm512_loadu_ps
+ #define GGML_F32x16_STORE   _mm512_storeu_ps
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+ #define GGML_F32x16_ADD     _mm512_add_ps
+ #define GGML_F32x16_MUL     _mm512_mul_ps
+ #define GGML_F32x16_REDUCE(res, x)                  \
+ do {                                                \
+     int offset = GGML_F32_ARR >> 1;                 \
+     for (int i = 0; i < offset; ++i) {              \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]);    \
+     }                                               \
+     offset >>= 1;                                   \
+     for (int i = 0; i < offset; ++i) {              \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]);    \
+     }                                               \
+     offset >>= 1;                                   \
+     for (int i = 0; i < offset; ++i) {              \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]);    \
+     }                                               \
+     res = _mm512_reduce_add_ps(x[0]);               \
+ } while (0)
+
+ // TODO: is this optimal ?
+
+ #define GGML_F32_VEC        GGML_F32x16
+ #define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+ #define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+ #define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+ #define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+ #define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+ #define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+ #define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+ // F16 AVX512
+
+ // F16 AVX
+
+ #define GGML_F16_STEP 64
+ #define GGML_F16_EPR  16
+
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+ #define GGML_F32Cx16             __m512
+ #define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+ #define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+ // so F16C guard isn't required
+ #define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+ #define GGML_F32Cx16_ADD         _mm512_add_ps
+ #define GGML_F32Cx16_MUL         _mm512_mul_ps
+ #define GGML_F32Cx16_REDUCE(res, x)                 \
+ do {                                                \
+     int offset = GGML_F32_ARR >> 1;                 \
+     for (int i = 0; i < offset; ++i) {              \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]);    \
+     }                                               \
+     offset >>= 1;                                   \
+     for (int i = 0; i < offset; ++i) {              \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]);    \
+     }                                               \
+     offset >>= 1;                                   \
+     for (int i = 0; i < offset; ++i) {              \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]);    \
+     }                                               \
+     res = _mm512_reduce_add_ps(x[0]);               \
+ } while (0)
+
+ #define GGML_F16_VEC                GGML_F32Cx16
+ #define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+ #define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+ #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+ #define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+ #define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+ #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+ #define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
  #elif defined(__AVX__)

  #define GGML_SIMD
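Note: both REDUCE macros above fold GGML_F32_ARR partial-sum vectors pairwise and finish with one horizontal add. For reference, a plain-function sketch of the same reduction for GGML_F32_ARR == 4 (GGML_F32_STEP / GGML_F32_EPR = 64 / 16; the macro's third fold pass is then a no-op). This is an explanatory sketch only, not code from the package:

    #include <immintrin.h>

    static float reduce_f32x16_arr4(__m512 x[4]) {
        // offset = 2: fold the upper pair of accumulators into the lower pair
        x[0] = _mm512_add_ps(x[0], x[2]);
        x[1] = _mm512_add_ps(x[1], x[3]);
        // offset = 1: fold the remaining pair
        x[0] = _mm512_add_ps(x[0], x[1]);
        // horizontal sum of the 16 lanes (AVX512F provides this reduction)
        return _mm512_reduce_add_ps(x[0]);
    }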
@@ -2392,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
          case GGML_FTYPE_MOSTLY_IQ2_XS:  wtype = GGML_TYPE_IQ2_XS;  break;
          case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
          case GGML_FTYPE_MOSTLY_IQ1_S:   wtype = GGML_TYPE_IQ1_S;   break;
+         case GGML_FTYPE_MOSTLY_IQ1_M:   wtype = GGML_TYPE_IQ1_M;   break;
          case GGML_FTYPE_MOSTLY_IQ4_NL:  wtype = GGML_TYPE_IQ4_NL;  break;
          case GGML_FTYPE_MOSTLY_IQ4_XS:  wtype = GGML_TYPE_IQ4_XS;  break;
          case GGML_FTYPE_MOSTLY_IQ3_S:   wtype = GGML_TYPE_IQ3_S;   break;
@@ -2447,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
          tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

+ GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+         if (tensor->ne[i] == 0) {
+             // empty if any dimension has no elements
+             return true;
+         }
+     }
+     return false;
+ }
+
  bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
      static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -2461,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
      static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-     return
+     return ggml_is_empty(t0) ? ggml_is_empty(t1) :
          (t1->ne[0]%t0->ne[0] == 0) &&
          (t1->ne[1]%t0->ne[1] == 0) &&
          (t1->ne[2]%t0->ne[2] == 0) &&
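Note: with these two changes a tensor is considered empty as soon as any ne[i] is zero, and an empty tensor can only repeat into another empty tensor. A small sketch of the assumed semantics (illustrative only; the helper name is hypothetical):

    #include <assert.h>
    #include "ggml.h"

    static void check_empty_semantics(struct ggml_context * ctx) {
        // ne[0] == 0 makes the tensor empty even though ne[1] == 8
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 0, 8);
        assert(ggml_is_empty(t));
        assert(ggml_nelements(t) == 0);   // consistent with the element count
    }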
@@ -2545,14 +2711,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
          GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
      }

- #if defined(GGML_USE_CUBLAS)
-     ggml_init_cublas();
- #elif defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CLBLAST)
      ggml_cl_init();
- #elif defined(GGML_USE_VULKAN)
-     ggml_vk_init_cpu_assist();
- #elif defined(GGML_USE_SYCL)
-     ggml_init_sycl();
  #endif

      ggml_setup_op_has_task_pass();
@@ -2772,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
          data_size *= ne[i];
      }

-     GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+     GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));

      void * data = view_src != NULL ? view_src->data : NULL;
      if (data != NULL) {
@@ -4413,45 +4573,38 @@ void ggml_mul_mat_set_prec(

  // ggml_mul_mat_id

+ // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+ // this will allow computing all the used experts in a single matrix multiplication
  struct ggml_tensor * ggml_mul_mat_id(
          struct ggml_context * ctx,
-         struct ggml_tensor * const as[],
-         int n_as,
+         struct ggml_tensor * as,
          struct ggml_tensor * ids,
          int id,
          struct ggml_tensor * b) {

      GGML_ASSERT(ids->type == GGML_TYPE_I32);
-     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-     GGML_ASSERT(ids->ne[1] == b->ne[1]);
+     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+     GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
      GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-     GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-     GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+     GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat

      bool is_node = false;

-     if (as[0]->grad || b->grad) {
+     if (as->grad || b->grad) {
          is_node = true;
      }

-     const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+     const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
      struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

      ggml_set_op_params_i32(result, 0, id);
-     ggml_set_op_params_i32(result, 1, n_as);

      result->op = GGML_OP_MUL_MAT_ID;
      result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-     result->src[0] = ids;
+     result->src[0] = as;
      result->src[1] = b;
-
-     for (int i = 0; i < n_as; i++) {
-         struct ggml_tensor * a = as[i];
-         GGML_ASSERT(ggml_are_same_shape(as[0], a));
-         GGML_ASSERT(ggml_can_mul_mat(a, b));
-         GGML_ASSERT(!ggml_is_transposed(a));
-         result->src[i + 2] = a;
-     }
+     result->src[2] = ids;

      return result;
  }
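Note: the ggml_mul_mat_id signature changes here; instead of an array of 2-D expert matrices plus an explicit n_as count, the experts are now passed as a single 3-D tensor stacked along ne[2], and ids moves from src[0] to src[2]. A sketch of the call-site impact (names such as expert_w_stacked and selected_ids are hypothetical, not taken from the package):

    #include "ggml.h"

    static struct ggml_tensor * route_experts(
            struct ggml_context * ctx,
            struct ggml_tensor  * expert_w_stacked,  // [n_embd, n_ff, n_expert]
            struct ggml_tensor  * selected_ids,      // I32, one expert id per row of cur
            struct ggml_tensor  * cur) {
        // 0.14.2 signature took an array of 2-D weights plus n_as:
        //   ggml_mul_mat_id(ctx, expert_w, n_as, selected_ids, 0, cur);
        // 0.14.4 takes a single 3-D tensor with the experts stacked along ne[2]:
        return ggml_mul_mat_id(ctx, expert_w_stacked, selected_ids, 0, cur);
    }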
@@ -7992,6 +8145,7 @@ static void ggml_compute_forward_add(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -8274,6 +8428,7 @@ static void ggml_compute_forward_add1(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -8401,6 +8556,7 @@ static void ggml_compute_forward_acc(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -10785,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
          const struct ggml_compute_params * params,
          struct ggml_tensor * dst) {

-     const struct ggml_tensor * ids = dst->src[0];
+     const struct ggml_tensor * src0 = dst->src[0];
      const struct ggml_tensor * src1 = dst->src[1];
-
-     const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+     const struct ggml_tensor * ids = dst->src[2];

      GGML_TENSOR_BINARY_OP_LOCALS

@@ -10818,13 +10973,13 @@
      GGML_ASSERT(nb1 <= nb2);
      GGML_ASSERT(nb2 <= nb3);

-     // broadcast factors
-     const int64_t r2 = ne12/ne02;
-     const int64_t r3 = ne13/ne03;
+     // broadcast is not supported with mmid
+     assert(ne12 == 1);
+     assert(ne13 == 1);

      // row groups
      const int id = ggml_get_op_params_i32(dst, 0);
-     const int n_as = ggml_get_op_params_i32(dst, 1);
+     const int n_as = src0->ne[2];

      char * wdata_src1_end = (src1->type == vec_dot_type) ?
              (char *) params->wdata :
@@ -10884,7 +11039,7 @@
          continue;
      }

-     const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+     size_t src0_offset = cur_a*src0->nb[2];

      const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
      const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10919,9 +11074,6 @@
          continue;
      }

-     assert(ne12 % ne02 == 0);
-     assert(ne13 % ne03 == 0);
-
      // block-tiling attempt
      const int64_t blck_0 = 16;
      const int64_t blck_1 = 16;
@@ -10938,14 +11090,14 @@
      const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);

      // broadcast src0 into src1
-     const int64_t i03 = i13/r3;
-     const int64_t i02 = i12/r2;
+     //const int64_t i03 = i13/r3;
+     //const int64_t i02 = i12/r2;

      const int64_t i1 = i11;
      const int64_t i2 = i12;
      const int64_t i3 = i13;

-     const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+     const char * src0_row = (const char *) src0->data + src0_offset;

      // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
      // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11010,7 +11162,6 @@ static void ggml_compute_forward_out_prod_f32(
      // nb01 >= nb00 - src0 is not transposed
      // compute by src0 rows

-     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
      // TODO: #if defined(GGML_USE_CLBLAST)

  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11361,6 @@ static void ggml_compute_forward_out_prod_q_f32(
      // nb01 >= nb00 - src0 is not transposed
      // compute by src0 rows

-     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
      // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

      if (params->type == GGML_TASK_TYPE_INIT) {
@@ -11306,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -11497,6 +11648,7 @@ static void ggml_compute_forward_set(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -11720,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -12423,6 +12576,7 @@ static void ggml_compute_forward_alibi(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -12511,6 +12665,7 @@ static void ggml_compute_forward_clamp(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -15952,37 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
      GGML_ASSERT(params);

-     if (tensor->op == GGML_OP_NONE) {
-         return;
-     }
-
- #ifdef GGML_USE_CUBLAS
-     bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-     if (skip_cpu) {
+     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
          return;
      }
-     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #elif defined(GGML_USE_VULKAN)
-     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
-     if (skip_cpu) {
-         ggml_vk_check_results_1_cpu_assist(params, tensor);
-     }
- #endif
-     if (skip_cpu) {
-         return;
-     }
-     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #endif // GGML_USE_CUBLAS

- #ifdef GGML_USE_SYCL
-     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
-     if (skip_cpu) {
-         return;
-     }
- #endif // GGML_USE_SYCL
      switch (tensor->op) {
          case GGML_OP_DUP:
              {
@@ -17834,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
  static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
      int n_tasks = 0;

+     if (ggml_is_empty(node)) {
+         // no need to multi-thread a no-op
+         n_tasks = 1;
+         return n_tasks;
+     }
+
      switch (node->op) {
          case GGML_OP_CPY:
          case GGML_OP_DUP:
@@ -18319,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
          case GGML_OP_MUL_MAT_ID:
              {
                  cur = 0;
-                 const struct ggml_tensor * src0 = node->src[2];
+                 const struct ggml_tensor * src0 = node->src[0];
                  const struct ggml_tensor * src1 = node->src[1];
                  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                  if (src1->type != vec_dot_type) {
                      cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                  }
-                 const int n_as = ggml_get_op_params_i32(node, 1);
+                 const int n_as = src0->ne[2];
                  cur += GGML_PAD(cur, sizeof(int64_t)); // align
                  cur += n_as * sizeof(int64_t); // matrix_row_counts
                  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -18452,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
          }
      }

- #ifdef GGML_USE_VULKAN
-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-     }
-     ggml_vk_preallocate_buffers_cpu_assist();
-
-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-     }
- #endif
-
      const int n_threads = cplan->n_threads;

      struct ggml_compute_state_shared state_shared = {
@@ -18519,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
          }
      }

- #ifdef GGML_USE_VULKAN
-     ggml_vk_graph_cleanup_cpu_assist();
- #endif
-
      // performance stats (graph)
      {
          int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18657,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

      // write binary data
      {
-         FILE * fout = fopen(fname, "wb");
+         FILE * fout = ggml_fopen(fname, "wb");

          if (!fout) {
              fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18795,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

      // read file into data
      {
-         FILE * fin = fopen(fname, "rb");
+         FILE * fin = ggml_fopen(fname, "rb");
          if (!fin) {
              fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
              return result;
@@ -19131,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
      char color[16];

-     FILE * fp = fopen(filename, "w");
+     FILE * fp = ggml_fopen(filename, "w");
      GGML_ASSERT(fp);

      fprintf(fp, "digraph G {\n");
@@ -20178,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
          case GGML_TYPE_IQ2_XXS:
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ2_S:
-         case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
+         case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
          case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
          case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
          default: // nothing
@@ -20203,7 +20323,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
      return
          type == GGML_TYPE_IQ2_XXS ||
          type == GGML_TYPE_IQ2_XS ||
-         type == GGML_TYPE_IQ1_S;
+         type == GGML_TYPE_IQ1_S;// ||
+         //type == GGML_TYPE_IQ1_M;
  }

  size_t ggml_quantize_chunk(
@@ -20247,6 +20368,7 @@ size_t ggml_quantize_chunk(
          case GGML_TYPE_IQ3_S:  result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
          case GGML_TYPE_IQ2_S:  result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
          case GGML_TYPE_IQ1_S:  result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+         case GGML_TYPE_IQ1_M:  result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
          case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  #if QK_K == 64
          case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -20449,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
  }

  struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-     FILE * file = fopen(fname, "rb");
+     FILE * file = ggml_fopen(fname, "rb");
      if (!file) {
          return NULL;
      }
@@ -21404,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
  }

  void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-     FILE * file = fopen(fname, "wb");
+     FILE * file = ggml_fopen(fname, "wb");
      if (!file) {
          GGML_ASSERT(false && "failed to open file for writing");
      }
@@ -21546,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
  }

  int ggml_cpu_has_blas(void) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
      return 1;
  #else
      return 0;
  #endif
  }

- int ggml_cpu_has_cublas(void) {
- #if defined(GGML_USE_CUBLAS)
+ int ggml_cpu_has_cuda(void) {
+ #if defined(GGML_USE_CUDA)
      return 1;
  #else
      return 0;
@@ -21594,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
  }

  int ggml_cpu_has_gpublas(void) {
-     return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
          ggml_cpu_has_sycl();
  }