llama_cpp 0.14.3 → 0.14.4

@@ -11,17 +11,6 @@ extern "C" {
  #define GGML_VK_MAX_DEVICES 16

  GGML_API void ggml_vk_instance_init(void);
- GGML_API void ggml_vk_init_cpu_assist(void);
-
- GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
- GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
- GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
- GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #endif
- GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
- GGML_API void ggml_vk_free_cpu_assist(void);

  // backend API
  GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
@@ -3,6 +3,7 @@

  #include "ggml-impl.h"
  #include "ggml-quants.h"
+ #include "ggml.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@

  #if defined(_WIN32)

+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
  #include <windows.h>

  typedef volatile LONG atomic_int;
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #include <Accelerate/Accelerate.h>
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
  #endif
  #elif defined(GGML_USE_OPENBLAS)
  #if defined(GGML_BLAS_USE_MKL)
@@ -284,10 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #endif
  #elif defined(GGML_USE_CLBLAST)
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
- #elif defined(GGML_USE_SYCL)
- #include "ggml-sycl.h"
  #endif

  // floating point type used to accumulate sums
@@ -430,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
  #define ggml_perf_cycles_per_ms() 0
  #endif

+ //
+ // cross-platform UTF-8 file paths
+ //
+
+ #ifdef _WIN32
+ static wchar_t * ggml_mbstowcs(const char * mbs) {
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+ if (!wlen) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+ if (!wlen) {
+ GGML_FREE(wbuf);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ return wbuf;
+ }
+ #endif
+
+ FILE * ggml_fopen(const char * fname, const char * mode) {
+ #ifdef _WIN32
+ FILE * file = NULL;
+
+ // convert fname (UTF-8)
+ wchar_t * wfname = ggml_mbstowcs(fname);
+ if (wfname) {
+ // convert mode (ANSI)
+ wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+ wchar_t * wmode_p = wmode;
+ do {
+ *wmode_p++ = (wchar_t)*mode;
+ } while (*mode++);
+
+ // open file
+ file = _wfopen(wfname, wmode);
+
+ GGML_FREE(wfname);
+ GGML_FREE(wmode);
+ }
+
+ return file;
+ #else
+ return fopen(fname, mode);
+ #endif
+ }
+
  //
  // cache line
  //
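The ggml_fopen helper added above behaves like fopen but takes a UTF-8 path on every platform; on Windows it widens the path with MultiByteToWideChar and calls _wfopen, elsewhere it falls through to plain fopen. A minimal caller sketch in C, assuming only the declaration from ggml.h shown further below (the function name dump_magic and the error handling are illustrative, not part of this release):

#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

// Read and print the 4-byte magic of a file identified by a UTF-8 path.
static int dump_magic(const char * path_utf8) {
    FILE * f = ggml_fopen(path_utf8, "rb"); // works for non-ASCII paths on Windows too
    if (!f) {
        return 1;
    }
    uint32_t magic = 0;
    if (fread(&magic, sizeof(magic), 1, f) == 1) {
        printf("magic: 0x%08x\n", (unsigned) magic);
    }
    fclose(f);
    return 0;
}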
@@ -740,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  .nrows = 1,
  },
+ [GGML_TYPE_IQ1_M] = {
+ .type_name = "iq1_m",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_iq1_m),
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
+ .from_float = NULL,
+ .from_float_reference = NULL,
+ .vec_dot = ggml_vec_dot_iq1_m_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ .nrows = 1,
+ },
  [GGML_TYPE_IQ4_NL] = {
  .type_name = "iq4_nl",
  .blck_size = QK4_NL,
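The new GGML_TYPE_IQ1_M entry mirrors the existing IQ1_S one: blocks of QK_K values stored as block_iq1_m, dequantized with dequantize_row_iq1_m and dotted against GGML_TYPE_Q8_K. Since from_float is NULL, the type is only produced through the dedicated quantize_iq1_m path wired up in ggml_quantize_chunk further down. A small sketch of reading the registered metadata back, assuming the public ggml_internal_get_type_traits() accessor from ggml.h:

#include "ggml.h"
#include <stdio.h>

// Print the traits registered for the new IQ1_M type.
static void print_iq1_m_traits(void) {
    const ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_IQ1_M);
    printf("%s: block size %d, type size %zu, quantized: %d\n",
           tt.type_name, tt.blck_size, tt.type_size, (int) tt.is_quantized);
}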
@@ -2485,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
  case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
  case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
+ case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
  case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
  case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
@@ -2540,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

+ GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ if (tensor->ne[i] == 0) {
+ // empty if any dimension has no elements
+ return true;
+ }
+ }
+ return false;
+ }
+
  bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -2554,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return
+ return ggml_is_empty(t0) ? ggml_is_empty(t1) :
  (t1->ne[0]%t0->ne[0] == 0) &&
  (t1->ne[1]%t0->ne[1] == 0) &&
  (t1->ne[2]%t0->ne[2] == 0) &&
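ggml_is_empty() reports whether any dimension of a tensor has zero elements, and ggml_can_repeat() now short-circuits on it: an empty t0 can only repeat into an equally empty t1. Together with the relaxed view assert below, this lets graphs contain zero-element tensors. A short sketch, assuming an already initialized ggml_context named ctx:

// ne[1] == 0, so the tensor holds no elements at all.
struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 0);

GGML_ASSERT(ggml_is_empty(t));       // true: one dimension has no elements
GGML_ASSERT(ggml_nelements(t) == 0); // consistent: there is no data to compute on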
@@ -2640,10 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

  #if defined(GGML_USE_CLBLAST)
  ggml_cl_init();
- #elif defined(GGML_USE_VULKAN)
- ggml_vk_init_cpu_assist();
- #elif defined(GGML_USE_SYCL)
- ggml_init_sycl();
  #endif

  ggml_setup_op_has_task_pass();
@@ -2863,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  data_size *= ne[i];
  }

- GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+ GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));

  void * data = view_src != NULL ? view_src->data : NULL;
  if (data != NULL) {
@@ -4504,45 +4573,38 @@ void ggml_mul_mat_set_prec(

  // ggml_mul_mat_id

+ // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+ // this will allow computing all the used experts in a single matrix multiplication
  struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
- struct ggml_tensor * const as[],
- int n_as,
+ struct ggml_tensor * as,
  struct ggml_tensor * ids,
  int id,
  struct ggml_tensor * b) {

  GGML_ASSERT(ids->type == GGML_TYPE_I32);
- GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
- GGML_ASSERT(ids->ne[1] == b->ne[1]);
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+ GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
  GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
- GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
- GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+ GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat

  bool is_node = false;

- if (as[0]->grad || b->grad) {
+ if (as->grad || b->grad) {
  is_node = true;
  }

- const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+ const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  ggml_set_op_params_i32(result, 0, id);
- ggml_set_op_params_i32(result, 1, n_as);

  result->op = GGML_OP_MUL_MAT_ID;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = ids;
+ result->src[0] = as;
  result->src[1] = b;
-
- for (int i = 0; i < n_as; i++) {
- struct ggml_tensor * a = as[i];
- GGML_ASSERT(ggml_are_same_shape(as[0], a));
- GGML_ASSERT(ggml_can_mul_mat(a, b));
- GGML_ASSERT(!ggml_is_transposed(a));
- result->src[i + 2] = a;
- }
+ result->src[2] = ids;

  return result;
  }
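ggml_mul_mat_id() no longer takes an array of per-expert tensors plus an n_as count: the experts are stacked along the third dimension of a single tensor as, ids moves from src[0] to src[2], and the expert count is recovered as as->ne[2]. A sketch of the new calling convention, with illustrative sizes and names that are not taken from this diff:

// Assumes an initialized ggml_context * ctx; the sizes below are placeholders.
const int64_t n_embd = 8, n_ff = 32, n_expert = 4, n_expert_used = 2, n_tokens = 3;

// as:  all expert weight matrices in one 3d tensor         -> [n_embd, n_ff, n_expert]
struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
// ids: I32 indices, one row of n_expert_used ids per token -> [n_expert_used, n_tokens]
struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);
// b:   input activations                                   -> [n_embd, n_tokens]
struct ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

// Entry `id` of each token's row in ids selects the expert, exactly as before.
struct ggml_tensor * out = ggml_mul_mat_id(ctx, as, ids, /*id =*/ 0, b); // out: [n_ff, n_tokens]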
@@ -8083,6 +8145,7 @@ static void ggml_compute_forward_add(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -8365,6 +8428,7 @@ static void ggml_compute_forward_add1(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -8492,6 +8556,7 @@ static void ggml_compute_forward_acc(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -10876,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {

- const struct ggml_tensor * ids = dst->src[0];
+ const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];
-
- const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+ const struct ggml_tensor * ids = dst->src[2];

  GGML_TENSOR_BINARY_OP_LOCALS

@@ -10909,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
  GGML_ASSERT(nb1 <= nb2);
  GGML_ASSERT(nb2 <= nb3);

- // broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
+ // broadcast is not supported with mmid
+ assert(ne12 == 1);
+ assert(ne13 == 1);

  // row groups
  const int id = ggml_get_op_params_i32(dst, 0);
- const int n_as = ggml_get_op_params_i32(dst, 1);
+ const int n_as = src0->ne[2];

  char * wdata_src1_end = (src1->type == vec_dot_type) ?
  (char *) params->wdata :
@@ -10975,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
  continue;
  }

- const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+ size_t src0_offset = cur_a*src0->nb[2];

  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11010,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
  continue;
  }

- assert(ne12 % ne02 == 0);
- assert(ne13 % ne03 == 0);
-
  // block-tiling attempt
  const int64_t blck_0 = 16;
  const int64_t blck_1 = 16;
@@ -11029,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
  const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);

  // broadcast src0 into src1
- const int64_t i03 = i13/r3;
- const int64_t i02 = i12/r2;
+ //const int64_t i03 = i13/r3;
+ //const int64_t i02 = i12/r2;

  const int64_t i1 = i11;
  const int64_t i2 = i12;
  const int64_t i3 = i13;

- const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+ const char * src0_row = (const char *) src0->data + src0_offset;

  // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
  // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11395,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -11586,6 +11648,7 @@ static void ggml_compute_forward_set(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -11809,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -12512,6 +12576,7 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -12600,6 +12665,7 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -16041,30 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  GGML_ASSERT(params);

- if (tensor->op == GGML_OP_NONE) {
- return;
- }
-
- #if defined(GGML_USE_VULKAN)
- const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- if (skip_cpu) {
- ggml_vk_check_results_1_cpu_assist(params, tensor);
- }
- #endif
- if (skip_cpu) {
+ if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
  return;
  }
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #endif // GGML_USE_VULKAN

- #ifdef GGML_USE_SYCL
- bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
- if (skip_cpu) {
- return;
- }
- #endif // GGML_USE_SYCL
  switch (tensor->op) {
  case GGML_OP_DUP:
  {
@@ -17916,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
  static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
  int n_tasks = 0;

+ if (ggml_is_empty(node)) {
+ // no need to multi-thread a no-op
+ n_tasks = 1;
+ return n_tasks;
+ }
+
  switch (node->op) {
  case GGML_OP_CPY:
  case GGML_OP_DUP:
@@ -18401,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  case GGML_OP_MUL_MAT_ID:
  {
  cur = 0;
- const struct ggml_tensor * src0 = node->src[2];
+ const struct ggml_tensor * src0 = node->src[0];
  const struct ggml_tensor * src1 = node->src[1];
  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
  if (src1->type != vec_dot_type) {
  cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
  }
- const int n_as = ggml_get_op_params_i32(node, 1);
+ const int n_as = src0->ne[2];
  cur += GGML_PAD(cur, sizeof(int64_t)); // align
  cur += n_as * sizeof(int64_t); // matrix_row_counts
  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -18534,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  }
  }

- #ifdef GGML_USE_VULKAN
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
- }
- ggml_vk_preallocate_buffers_cpu_assist();
-
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
- }
- #endif
-
  const int n_threads = cplan->n_threads;

  struct ggml_compute_state_shared state_shared = {
@@ -18601,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  }
  }

- #ifdef GGML_USE_VULKAN
- ggml_vk_graph_cleanup_cpu_assist();
- #endif
-
  // performance stats (graph)
  {
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18739,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

  // write binary data
  {
- FILE * fout = fopen(fname, "wb");
+ FILE * fout = ggml_fopen(fname, "wb");

  if (!fout) {
  fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18877,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

  // read file into data
  {
- FILE * fin = fopen(fname, "rb");
+ FILE * fin = ggml_fopen(fname, "rb");
  if (!fin) {
  fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
  return result;
@@ -19213,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
  char color[16];

- FILE * fp = fopen(filename, "w");
+ FILE * fp = ggml_fopen(filename, "w");
  GGML_ASSERT(fp);

  fprintf(fp, "digraph G {\n");
@@ -20260,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
  case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
  default: // nothing
@@ -20285,7 +20323,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
  return
  type == GGML_TYPE_IQ2_XXS ||
  type == GGML_TYPE_IQ2_XS ||
- type == GGML_TYPE_IQ1_S;
+ type == GGML_TYPE_IQ1_S;// ||
+ //type == GGML_TYPE_IQ1_M;
  }

  size_t ggml_quantize_chunk(
@@ -20329,6 +20368,7 @@ size_t ggml_quantize_chunk(
  case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  #if QK_K == 64
  case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
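With the dispatch above, tensors can be quantized to IQ1_M through the public ggml_quantize_chunk() entry point. The requires-imatrix check for IQ1_M is commented out in this release, but like the other IQ1/IQ2 types it is meant to be driven by an importance matrix. A hedged sketch of the call, assuming the float buffers and row geometry are provided by the caller:

// src:     nrows * n_per_row input floats
// dst:     destination buffer sized for nrows rows of IQ1_M data
// imatrix: n_per_row importance weights (strongly recommended for the IQ1 types)
ggml_quantize_init(GGML_TYPE_IQ1_M); // build the shared lookup tables once
size_t n_bytes = ggml_quantize_chunk(GGML_TYPE_IQ1_M, src, dst,
                                     /*start =*/ 0, nrows, n_per_row, imatrix);
// n_bytes is the amount of quantized data written to dst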
@@ -20531,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
  }

  struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
- FILE * file = fopen(fname, "rb");
+ FILE * file = ggml_fopen(fname, "rb");
  if (!file) {
  return NULL;
  }
@@ -21486,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
  }

  void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
- FILE * file = fopen(fname, "wb");
+ FILE * file = ggml_fopen(fname, "wb");
  if (!file) {
  GGML_ASSERT(false && "failed to open file for writing");
  }
@@ -21628,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
  }

  int ggml_cpu_has_blas(void) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
  return 1;
  #else
  return 0;
  #endif
  }

- int ggml_cpu_has_cublas(void) {
- #if defined(GGML_USE_CUBLAS)
+ int ggml_cpu_has_cuda(void) {
+ #if defined(GGML_USE_CUDA)
  return 1;
  #else
  return 0;
@@ -21676,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
  }

  int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+ return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
  ggml_cpu_has_sycl();
  }
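The feature probes are renamed along with the build flag: GGML_USE_CUBLAS becomes GGML_USE_CUDA and ggml_cpu_has_cublas() becomes ggml_cpu_has_cuda() (the matching prototype change in ggml.h follows below); ggml_cpu_has_gpublas() keeps its name but now calls the renamed probe. Callers checking for CUDA support need the new spelling, for example:

// ggml_cpu_has_cublas() no longer exists after this release.
if (ggml_cpu_has_cuda()) {
    printf("ggml was built with CUDA support\n");
}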
 
@@ -214,9 +214,10 @@
  # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
  #endif

- #include <stdint.h>
- #include <stddef.h>
  #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdio.h>

  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1
@@ -368,6 +369,7 @@ extern "C" {
  GGML_TYPE_I32 = 26,
  GGML_TYPE_I64 = 27,
  GGML_TYPE_F64 = 28,
+ GGML_TYPE_IQ1_M = 29,
  GGML_TYPE_COUNT,
  };

@@ -407,6 +409,7 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
  };

  // available tensor operations:
@@ -708,6 +711,9 @@ extern "C" {

  GGML_API void ggml_print_backtrace(void);

+ // accepts a UTF-8 path, even on Windows
+ GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
+
  GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
  GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

@@ -744,6 +750,7 @@ extern "C" {
  GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
  GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
  GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
@@ -1157,8 +1164,7 @@ extern "C" {
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
- struct ggml_tensor * const as[],
- int n_as,
+ struct ggml_tensor * as,
  struct ggml_tensor * ids,
  int id,
  struct ggml_tensor * b);
@@ -2350,7 +2356,7 @@ extern "C" {
  GGML_API int ggml_cpu_has_fp16_va (void);
  GGML_API int ggml_cpu_has_wasm_simd (void);
  GGML_API int ggml_cpu_has_blas (void);
- GGML_API int ggml_cpu_has_cublas (void);
+ GGML_API int ggml_cpu_has_cuda (void);
  GGML_API int ggml_cpu_has_clblast (void);
  GGML_API int ggml_cpu_has_vulkan (void);
  GGML_API int ggml_cpu_has_kompute (void);