llama_cpp 0.3.1 → 0.3.2

@@ -8,10 +8,6 @@ extern "C" {
 
  #define GGML_CUDA_MAX_DEVICES 16
 
- struct ggml_tensor_extra_gpu {
-     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
- };
-
  void ggml_init_cublas(void);
  void ggml_cuda_set_tensor_split(const float * tensor_split);
 
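Note on the hunk above: 0.3.2's bundled llama.cpp drops `struct ggml_tensor_extra_gpu` from the public CUDA header; the declaration appears to move into the CUDA implementation file rather than disappear. As a reading aid, here is a minimal C sketch of what the removed declaration expressed, together with an illustrative accessor that is not part of ggml (only the struct layout comes from the hunk; the helper and its bounds check are hypothetical):

    #include <stdio.h>
    #include <stddef.h>

    #define GGML_CUDA_MAX_DEVICES 16

    /* Layout of the removed declaration: one device pointer per GPU, so a tensor
     * whose rows are split across devices can record where each slice lives. */
    struct ggml_tensor_extra_gpu {
        void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    };

    /* Illustrative helper (not part of ggml): look up the slice stored on one
     * device, or NULL if the tensor was never placed there. */
    static void * tensor_slice_on_device(const struct ggml_tensor_extra_gpu * extra, int device) {
        if (device < 0 || device >= GGML_CUDA_MAX_DEVICES) {
            return NULL;
        }
        return extra->data_device[device];
    }

    int main(void) {
        struct ggml_tensor_extra_gpu extra = {0};
        float slice0[4] = {0};                 // pretend device 0 holds this slice
        extra.data_device[0] = slice0;
        printf("device 0 slice: %p\n", tensor_slice_on_device(&extra, 0));
        printf("device 1 slice: %p\n", tensor_slice_on_device(&extra, 1)); // NULL: nothing placed there
        return 0;
    }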
@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
 
  void ggml_metal_free(struct ggml_metal_context * ctx) {
      fprintf(stderr, "%s: deallocating\n", __func__);
-
+     for (int i = 0; i < ctx->n_buffers; ++i) {
+         [ctx->buffers[i].metal release];
+     }
      free(ctx);
  }
 
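The hunk above plugs a leak in `ggml_metal_free`: the context was freed while the Metal buffer objects referenced from `ctx->buffers[i].metal` were never sent `release`, so their memory outlived the context. The sketch below restates the ownership pattern in plain C with hypothetical names (`metal_context`, `buffer_release`); it is an analogy for the Objective-C fix, not the ggml-metal API.

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_BUFFERS 16

    /* Hypothetical stand-ins for the Metal buffer handles and their release call. */
    typedef struct { void * handle; } metal_buffer;

    static void buffer_release(metal_buffer * b) {
        free(b->handle);
        b->handle = NULL;
    }

    struct metal_context {
        int          n_buffers;
        metal_buffer buffers[MAX_BUFFERS];
    };

    /* Mirrors the shape of the fixed ggml_metal_free: release every per-buffer
     * resource before freeing the context, otherwise the only references to
     * those buffers are destroyed and they leak. */
    static void metal_context_free(struct metal_context * ctx) {
        fprintf(stderr, "%s: deallocating\n", __func__);
        for (int i = 0; i < ctx->n_buffers; ++i) {
            buffer_release(&ctx->buffers[i]);
        }
        free(ctx);
    }

    int main(void) {
        struct metal_context * ctx = calloc(1, sizeof *ctx);
        if (!ctx) return 1;
        ctx->n_buffers = 2;
        for (int i = 0; i < ctx->n_buffers; ++i) {
            ctx->buffers[i].handle = malloc(1024); // stands in for a retained buffer
        }
        metal_context_free(ctx);
        return 0;
    }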
@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
  const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
  const int in = tid - step*im; // 0...15 or 0...7
 
- #if K_QUANTS_PER_ITERATION == 1
+ \n#if K_QUANTS_PER_ITERATION == 1\n
  const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
  const int is = 0;
- #else
+
+ \n#else\n
+
  const int l0 = 4 * in; // 0, 4, 8, ..., 28
  const int is = in / 4;
- #endif
+
+ \n#endif\n
+
  const int ql_offset = 64*im + l0;
  const int qh_offset = 32*im + l0;
  const int s_offset = 8*im + is;
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
 
  const float d = vload_half(0, &x[i].d);
 
- #if K_QUANTS_PER_ITERATION == 1
+ \n#if K_QUANTS_PER_ITERATION == 1\n
  float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
            + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
            + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
            + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
            +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
  tmp[16 * ix + tid] += sum;
- #else
+ \n#else\n
  float sum = 0;
  for (int l = 0; l < 4; ++l) {
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
            + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
  }
  tmp[16 * ix + tid] += sum;
- #endif
+ \n#endif\n
 
  }
 
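The four `\n#if` / `\n#else` / `\n#endif` hunks above all make the same change. In ggml-opencl.cpp the kernel text is produced by stringifying a macro argument; stringification collapses real newlines into single spaces, and a `#if` sitting at the start of a source line inside the argument is picked up by the host C++ preprocessor instead of being embedded. Writing a literal `\n` on each side keeps the host preprocessor out of it and makes the directive land on its own line in the string handed to the OpenCL compiler. A self-contained sketch of the idiom, with illustrative macro and kernel names rather than the real ggml-opencl source:

    #include <stdio.h>

    /* #__VA_ARGS__ turns the whole block into ONE string literal and collapses
     * real newlines, so an OpenCL preprocessor directive needs an explicit \n
     * on each side; the leading backslash also stops the host preprocessor
     * from treating that line as its own #ifdef/#endif. */
    #define MULTILINE_QUOTE(...) #__VA_ARGS__

    static const char * program_source = MULTILINE_QUOTE(
        __kernel void scale(__global float * x, const float v) { x[get_global_id(0)] *= v; }
        \n#ifdef USE_FAST_PATH\n
        __kernel void scale_fast(__global float * x, const float v) { x[get_global_id(0)] *= v; }
        \n#endif\n
    );

    int main(void) {
        /* The printed text shows #ifdef/#endif on their own lines, ready to be
         * passed to clCreateProgramWithSource / clBuildProgram. */
        printf("%s\n", program_source);
        return 0;
    }

Without the escapes, `#ifdef` would generally either be swallowed by the host preprocessor or end up in the middle of one long line of the generated source, where the OpenCL compiler would reject it.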
@@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
  const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
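The final hunk corrects a copy-paste slip in `ggml_cl_mul_f32`: `ne03` was read from `src0->ne[2]` instead of `src0->ne[3]`, so the flattened element count `ne0 = ne00 * ne01 * ne02 * ne03` was wrong whenever the third and fourth dimensions differ. A tiny standalone check with a hypothetical 4-D shape (not a real ggml_tensor):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        /* Hypothetical 4-D shape standing in for src0->ne[0..3]. */
        const int64_t ne[4] = { 32, 8, 4, 2 };

        const int64_t wrong   = ne[0] * ne[1] * ne[2] * ne[2]; // ne03 taken from ne[2]: 4096
        const int64_t correct = ne[0] * ne[1] * ne[2] * ne[3]; // ne03 taken from ne[3]: 2048

        /* Iterating over "wrong" elements over-covers the tensor whenever
         * ne[2] > ne[3], and under-covers it when ne[2] < ne[3]. */
        printf("wrong=%lld correct=%lld\n", (long long) wrong, (long long) correct);
        return 0;
    }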