llama_cpp 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,17 +11,6 @@ extern "C" {
  #define GGML_VK_MAX_DEVICES 16

  GGML_API void ggml_vk_instance_init(void);
- GGML_API void ggml_vk_init_cpu_assist(void);
-
- GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
- GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
- GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
- GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #endif
- GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
- GGML_API void ggml_vk_free_cpu_assist(void);

  // backend API
  GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
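The declarations removed above are the old "CPU assist" hooks, through which the CPU graph executor handed individual nodes to Vulkan; the matching call sites disappear further down in this diff. After this change the header only exposes the regular backend interface. A rough, hypothetical sketch of how a caller reaches Vulkan through the entry point kept above (ggml_backend_free() comes from ggml-backend.h, not this header):

    // hypothetical usage sketch, not part of this diff
    ggml_backend_t backend = ggml_backend_vk_init(0 /* device index */);
    if (backend != NULL) {
        // ... allocate buffers and run graphs through the ggml-backend API ...
        ggml_backend_free(backend);
    }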
@@ -3,6 +3,7 @@

  #include "ggml-impl.h"
  #include "ggml-quants.h"
+ #include "ggml.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@

  #if defined(_WIN32)

+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
  #include <windows.h>

  typedef volatile LONG atomic_int;
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #include <Accelerate/Accelerate.h>
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
  #endif
  #elif defined(GGML_USE_OPENBLAS)
  #if defined(GGML_BLAS_USE_MKL)
@@ -284,10 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #endif
  #elif defined(GGML_USE_CLBLAST)
  #include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
- #elif defined(GGML_USE_SYCL)
- #include "ggml-sycl.h"
  #endif

  // floating point type used to accumulate sums
@@ -339,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
      return GGML_FP32_TO_FP16(x);
  }

- void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-     for (int i = 0; i < n; i++) {
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+     for (int64_t i = 0; i < n; i++) {
          y[i] = GGML_FP16_TO_FP32(x[i]);
      }
  }

- void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-     int i = 0;
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+     int64_t i = 0;
  #if defined(__F16C__)
      for (; i + 7 < n; i += 8) {
          __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -430,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
  #define ggml_perf_cycles_per_ms() 0
  #endif

+ //
+ // cross-platform UTF-8 file paths
+ //
+
+ #ifdef _WIN32
+ static wchar_t * ggml_mbstowcs(const char * mbs) {
+     int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+     if (!wlen) {
+         errno = EINVAL;
+         return NULL;
+     }
+
+     wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+     wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+     if (!wlen) {
+         GGML_FREE(wbuf);
+         errno = EINVAL;
+         return NULL;
+     }
+
+     return wbuf;
+ }
+ #endif
+
+ FILE * ggml_fopen(const char * fname, const char * mode) {
+ #ifdef _WIN32
+     FILE * file = NULL;
+
+     // convert fname (UTF-8)
+     wchar_t * wfname = ggml_mbstowcs(fname);
+     if (wfname) {
+         // convert mode (ANSI)
+         wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+         wchar_t * wmode_p = wmode;
+         do {
+             *wmode_p++ = (wchar_t)*mode;
+         } while (*mode++);
+
+         // open file
+         file = _wfopen(wfname, wmode);
+
+         GGML_FREE(wfname);
+         GGML_FREE(wmode);
+     }
+
+     return file;
+ #else
+     return fopen(fname, mode);
+ #endif
+ }
+
  //
  // cache line
  //
@@ -740,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .vec_dot_type = GGML_TYPE_Q8_K,
          .nrows = 1,
      },
+     [GGML_TYPE_IQ1_M] = {
+         .type_name = "iq1_m",
+         .blck_size = QK_K,
+         .type_size = sizeof(block_iq1_m),
+         .is_quantized = true,
+         .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
+         .from_float = NULL,
+         .from_float_reference = NULL,
+         .vec_dot = ggml_vec_dot_iq1_m_q8_K,
+         .vec_dot_type = GGML_TYPE_Q8_K,
+         .nrows = 1,
+     },
      [GGML_TYPE_IQ4_NL] = {
          .type_name = "iq4_nl",
          .blck_size = QK4_NL,
@@ -2485,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
          case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
          case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
          case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
+         case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
          case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
          case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
          case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
@@ -2540,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
          tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

+ GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+         if (tensor->ne[i] == 0) {
+             // empty if any dimension has no elements
+             return true;
+         }
+     }
+     return false;
+ }
+
  bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
      static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -2554,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
      static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-     return
+     return ggml_is_empty(t0) ? ggml_is_empty(t1) :
          (t1->ne[0]%t0->ne[0] == 0) &&
          (t1->ne[1]%t0->ne[1] == 0) &&
          (t1->ne[2]%t0->ne[2] == 0) &&
@@ -2640,10 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

  #if defined(GGML_USE_CLBLAST)
      ggml_cl_init();
- #elif defined(GGML_USE_VULKAN)
-     ggml_vk_init_cpu_assist();
- #elif defined(GGML_USE_SYCL)
-     ggml_init_sycl();
  #endif

      ggml_setup_op_has_task_pass();
@@ -2863,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
          data_size *= ne[i];
      }

-     GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+     GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));

      void * data = view_src != NULL ? view_src->data : NULL;
      if (data != NULL) {
@@ -4504,45 +4573,38 @@ void ggml_mul_mat_set_prec(

  // ggml_mul_mat_id

+ // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+ // this will allow computing all the used experts in a single matrix multiplication
  struct ggml_tensor * ggml_mul_mat_id(
          struct ggml_context * ctx,
-         struct ggml_tensor * const as[],
-         int n_as,
+         struct ggml_tensor * as,
          struct ggml_tensor * ids,
          int id,
          struct ggml_tensor * b) {

      GGML_ASSERT(ids->type == GGML_TYPE_I32);
-     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-     GGML_ASSERT(ids->ne[1] == b->ne[1]);
+     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+     GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
      GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-     GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-     GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+     GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat

      bool is_node = false;

-     if (as[0]->grad || b->grad) {
+     if (as->grad || b->grad) {
          is_node = true;
      }

-     const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+     const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
      struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

      ggml_set_op_params_i32(result, 0, id);
-     ggml_set_op_params_i32(result, 1, n_as);

      result->op = GGML_OP_MUL_MAT_ID;
      result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-     result->src[0] = ids;
+     result->src[0] = as;
      result->src[1] = b;
-
-     for (int i = 0; i < n_as; i++) {
-         struct ggml_tensor * a = as[i];
-         GGML_ASSERT(ggml_are_same_shape(as[0], a));
-         GGML_ASSERT(ggml_can_mul_mat(a, b));
-         GGML_ASSERT(!ggml_is_transposed(a));
-         result->src[i + 2] = a;
-     }
+     result->src[2] = ids;

      return result;
  }
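The hunk above switches ggml_mul_mat_id() from taking an array of 2D expert matrices (previously stored in src[2..]) to a single 3D tensor with the experts stacked along the third dimension, and moves the row-id tensor to src[2]. A minimal sketch of the new calling convention, assuming hypothetical sizes n_embd, n_ff, n_expert, n_expert_used and n_tokens and an already initialized context ctx:

    // sketch only: every size below is made up for illustration
    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);  // stacked experts
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens); // expert per b row
    struct ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    // per the comment in the header: ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    struct ggml_tensor * out = ggml_mul_mat_id(ctx, as, ids, 0 /* id */, b);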
@@ -8083,6 +8145,7 @@ static void ggml_compute_forward_add(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -8365,6 +8428,7 @@ static void ggml_compute_forward_add1(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -8492,6 +8556,7 @@ static void ggml_compute_forward_acc(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -10876,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
          const struct ggml_compute_params * params,
          struct ggml_tensor * dst) {

-     const struct ggml_tensor * ids = dst->src[0];
+     const struct ggml_tensor * src0 = dst->src[0];
      const struct ggml_tensor * src1 = dst->src[1];
-
-     const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+     const struct ggml_tensor * ids = dst->src[2];

      GGML_TENSOR_BINARY_OP_LOCALS

@@ -10909,13 +10973,13 @@
      GGML_ASSERT(nb1 <= nb2);
      GGML_ASSERT(nb2 <= nb3);

-     // broadcast factors
-     const int64_t r2 = ne12/ne02;
-     const int64_t r3 = ne13/ne03;
+     // broadcast is not supported with mmid
+     assert(ne12 == 1);
+     assert(ne13 == 1);

      // row groups
      const int id = ggml_get_op_params_i32(dst, 0);
-     const int n_as = ggml_get_op_params_i32(dst, 1);
+     const int n_as = src0->ne[2];

      char * wdata_src1_end = (src1->type == vec_dot_type) ?
              (char *) params->wdata :
@@ -10975,7 +11039,7 @@
          continue;
      }

-     const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+     size_t src0_offset = cur_a*src0->nb[2];

      const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
      const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11010,9 +11074,6 @@
          continue;
      }

-     assert(ne12 % ne02 == 0);
-     assert(ne13 % ne03 == 0);
-
      // block-tiling attempt
      const int64_t blck_0 = 16;
      const int64_t blck_1 = 16;
@@ -11029,14 +11090,14 @@
      const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);

      // broadcast src0 into src1
-     const int64_t i03 = i13/r3;
-     const int64_t i02 = i12/r2;
+     //const int64_t i03 = i13/r3;
+     //const int64_t i02 = i12/r2;

      const int64_t i1 = i11;
      const int64_t i2 = i12;
      const int64_t i3 = i13;

-     const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+     const char * src0_row = (const char *) src0->data + src0_offset;

      // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
      // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11395,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -11586,6 +11648,7 @@ static void ggml_compute_forward_set(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -11809,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -12512,6 +12576,7 @@ static void ggml_compute_forward_alibi(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -12600,6 +12665,7 @@ static void ggml_compute_forward_clamp(
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ3_XXS:
          case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M:
          case GGML_TYPE_IQ4_NL:
          case GGML_TYPE_IQ4_XS:
          case GGML_TYPE_IQ3_S:
@@ -16041,30 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
      GGML_ASSERT(params);

-     if (tensor->op == GGML_OP_NONE) {
-         return;
-     }
-
- #if defined(GGML_USE_VULKAN)
-     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
-     if (skip_cpu) {
-         ggml_vk_check_results_1_cpu_assist(params, tensor);
-     }
- #endif
-     if (skip_cpu) {
+     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
          return;
      }
-     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #endif // GGML_USE_VULKAN

- #ifdef GGML_USE_SYCL
-     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
-     if (skip_cpu) {
-         return;
-     }
- #endif // GGML_USE_SYCL
      switch (tensor->op) {
          case GGML_OP_DUP:
              {
@@ -17916,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
  static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
      int n_tasks = 0;

+     if (ggml_is_empty(node)) {
+         // no need to multi-thread a no-op
+         n_tasks = 1;
+         return n_tasks;
+     }
+
      switch (node->op) {
          case GGML_OP_CPY:
          case GGML_OP_DUP:
@@ -18401,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
          case GGML_OP_MUL_MAT_ID:
              {
                  cur = 0;
-                 const struct ggml_tensor * src0 = node->src[2];
+                 const struct ggml_tensor * src0 = node->src[0];
                  const struct ggml_tensor * src1 = node->src[1];
                  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                  if (src1->type != vec_dot_type) {
                      cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                  }
-                 const int n_as = ggml_get_op_params_i32(node, 1);
+                 const int n_as = src0->ne[2];
                  cur += GGML_PAD(cur, sizeof(int64_t)); // align
                  cur += n_as * sizeof(int64_t); // matrix_row_counts
                  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -18534,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
          }
      }

- #ifdef GGML_USE_VULKAN
-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-     }
-     ggml_vk_preallocate_buffers_cpu_assist();
-
-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-     }
- #endif
-
      const int n_threads = cplan->n_threads;

      struct ggml_compute_state_shared state_shared = {
@@ -18601,10 +18642,6 @@
          }
      }

- #ifdef GGML_USE_VULKAN
-     ggml_vk_graph_cleanup_cpu_assist();
- #endif
-
      // performance stats (graph)
      {
          int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18739,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

      // write binary data
      {
-         FILE * fout = fopen(fname, "wb");
+         FILE * fout = ggml_fopen(fname, "wb");

          if (!fout) {
              fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18877,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

      // read file into data
      {
-         FILE * fin = fopen(fname, "rb");
+         FILE * fin = ggml_fopen(fname, "rb");
          if (!fin) {
              fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
              return result;
@@ -19213,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
      char color[16];

-     FILE * fp = fopen(filename, "w");
+     FILE * fp = ggml_fopen(filename, "w");
      GGML_ASSERT(fp);

      fprintf(fp, "digraph G {\n");
@@ -20260,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
          case GGML_TYPE_IQ2_XXS:
          case GGML_TYPE_IQ2_XS:
          case GGML_TYPE_IQ2_S:
-         case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
+         case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
          case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
          case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
          default: // nothing
@@ -20285,18 +20323,19 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
      return
          type == GGML_TYPE_IQ2_XXS ||
          type == GGML_TYPE_IQ2_XS ||
-         type == GGML_TYPE_IQ1_S;
+         type == GGML_TYPE_IQ1_S;// ||
+         //type == GGML_TYPE_IQ1_M;
  }

  size_t ggml_quantize_chunk(
          enum ggml_type type,
          const float * src,
          void * dst,
-         int start,
-         int nrows,
-         int n_per_row,
+         int64_t start,
+         int64_t nrows,
+         int64_t n_per_row,
          const float * imatrix) {
-     const int n = nrows * n_per_row;
+     const int64_t n = (int64_t) nrows * n_per_row;

      if (ggml_quantize_requires_imatrix(type)) {
          GGML_ASSERT(imatrix != NULL);
@@ -20329,6 +20368,7 @@ size_t ggml_quantize_chunk(
          case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
          case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
          case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+         case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
          case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  #if QK_K == 64
          case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -20531,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
  }

  struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-     FILE * file = fopen(fname, "rb");
+     FILE * file = ggml_fopen(fname, "rb");
      if (!file) {
          return NULL;
      }
@@ -21486,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
  }

  void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-     FILE * file = fopen(fname, "wb");
+     FILE * file = ggml_fopen(fname, "wb");
      if (!file) {
          GGML_ASSERT(false && "failed to open file for writing");
      }
@@ -21628,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
  }

  int ggml_cpu_has_blas(void) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
      return 1;
  #else
      return 0;
  #endif
  }

- int ggml_cpu_has_cublas(void) {
- #if defined(GGML_USE_CUBLAS)
+ int ggml_cpu_has_cuda(void) {
+ #if defined(GGML_USE_CUDA)
      return 1;
  #else
      return 0;
@@ -21676,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
  }

  int ggml_cpu_has_gpublas(void) {
-     return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
             ggml_cpu_has_sycl();
  }

@@ -214,9 +214,10 @@
  # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
  #endif

- #include <stdint.h>
- #include <stddef.h>
  #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdio.h>

  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1
@@ -331,8 +332,8 @@ extern "C" {
      GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
      GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-     GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-     GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+     GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+     GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);

      struct ggml_object;
      struct ggml_context;
@@ -368,6 +369,7 @@ extern "C" {
          GGML_TYPE_I32 = 26,
          GGML_TYPE_I64 = 27,
          GGML_TYPE_F64 = 28,
+         GGML_TYPE_IQ1_M = 29,
          GGML_TYPE_COUNT,
      };

@@ -407,6 +409,7 @@ extern "C" {
          GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
          GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
          GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
+         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
      };

      // available tensor operations:
@@ -708,6 +711,9 @@ extern "C" {

      GGML_API void ggml_print_backtrace(void);

+     // accepts a UTF-8 path, even on Windows
+     GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
+
      GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
      GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

@@ -744,6 +750,7 @@ extern "C" {
      GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
      GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
      GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
+     GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
      GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
      GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
      GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
@@ -1157,8 +1164,7 @@ extern "C" {
      // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
      GGML_API struct ggml_tensor * ggml_mul_mat_id(
              struct ggml_context * ctx,
-             struct ggml_tensor * const as[],
-             int n_as,
+             struct ggml_tensor * as,
              struct ggml_tensor * ids,
              int id,
              struct ggml_tensor * b);
@@ -2204,9 +2210,9 @@ extern "C" {
              enum ggml_type type,
              const float * src,
              void * dst,
-             int start,
-             int nrows,
-             int n_per_row,
+             int64_t start,
+             int64_t nrows,
+             int64_t n_per_row,
              const float * imatrix);

      //
@@ -2350,7 +2356,7 @@ extern "C" {
      GGML_API int ggml_cpu_has_fp16_va (void);
      GGML_API int ggml_cpu_has_wasm_simd (void);
      GGML_API int ggml_cpu_has_blas (void);
-     GGML_API int ggml_cpu_has_cublas (void);
+     GGML_API int ggml_cpu_has_cuda (void);
      GGML_API int ggml_cpu_has_clblast (void);
      GGML_API int ggml_cpu_has_vulkan (void);
      GGML_API int ggml_cpu_has_kompute (void);
@@ -2371,8 +2377,8 @@ extern "C" {
  #else
  #define GGML_RESTRICT restrict
  #endif
-     typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+     typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
      typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);

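The header now declares ggml_fopen() (hence the added <stdio.h> include), which on Windows converts a UTF-8 path to wide characters and opens it with _wfopen(), and simply forwards to fopen() elsewhere; the gguf and graph I/O paths earlier in this diff are switched over to it. A minimal usage sketch with a made-up file name:

    // minimal sketch; the path is only an example of a non-ASCII UTF-8 name
    FILE * f = ggml_fopen("模型-7b.gguf", "rb");
    if (f != NULL) {
        // ... read the GGUF file as usual ...
        fclose(f);
    }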