llama_cpp 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,10 +8,6 @@ extern "C" {
8
8
 
9
9
  #define GGML_CUDA_MAX_DEVICES 16
10
10
 
11
- struct ggml_tensor_extra_gpu {
12
- void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
13
- };
14
-
15
11
  void ggml_init_cublas(void);
16
12
  void ggml_cuda_set_tensor_split(const float * tensor_split);
17
13
 
@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
202
202
 
203
203
  void ggml_metal_free(struct ggml_metal_context * ctx) {
204
204
  fprintf(stderr, "%s: deallocating\n", __func__);
205
-
205
+ for (int i = 0; i < ctx->n_buffers; ++i) {
206
+ [ctx->buffers[i].metal release];
207
+ }
206
208
  free(ctx);
207
209
  }
208
210
 
@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
653
653
  const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
654
654
  const int in = tid - step*im; // 0...15 or 0...7
655
655
 
656
- #if K_QUANTS_PER_ITERATION == 1
656
+ \n#if K_QUANTS_PER_ITERATION == 1\n
657
657
  const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
658
658
  const int is = 0;
659
- #else
659
+
660
+ \n#else\n
661
+
660
662
  const int l0 = 4 * in; // 0, 4, 8, ..., 28
661
663
  const int is = in / 4;
662
- #endif
664
+
665
+ \n#endif\n
666
+
663
667
  const int ql_offset = 64*im + l0;
664
668
  const int qh_offset = 32*im + l0;
665
669
  const int s_offset = 8*im + is;
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
676
680
 
677
681
  const float d = vload_half(0, &x[i].d);
678
682
 
679
- #if K_QUANTS_PER_ITERATION == 1
683
+ \n#if K_QUANTS_PER_ITERATION == 1\n
680
684
  float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
681
685
  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
682
686
  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
686
690
  + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
687
691
  +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
688
692
  tmp[16 * ix + tid] += sum;
689
- #else
693
+ \n#else\n
690
694
  float sum = 0;
691
695
  for (int l = 0; l < 4; ++l) {
692
696
  sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
695
699
  + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
696
700
  }
697
701
  tmp[16 * ix + tid] += sum;
698
- #endif
702
+ \n#endif\n
699
703
 
700
704
  }
701
705
 
@@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
1376
1380
  const int64_t ne00 = src0->ne[0];
1377
1381
  const int64_t ne01 = src0->ne[1];
1378
1382
  const int64_t ne02 = src0->ne[2];
1379
- const int64_t ne03 = src0->ne[2];
1383
+ const int64_t ne03 = src0->ne[3];
1380
1384
  const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
1381
1385
  const int64_t ne10 = src1->ne[0];
1382
1386
  const int64_t ne11 = src1->ne[1];