llama_cpp 0.12.5 → 0.12.6

@@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
  }
  char * dst_ptr = (char *) dst;
 
- const int64_t ne0 = src->ne[0];
- const int64_t nb0 = src->nb[0];
- const int64_t nb1 = src->nb[1];
- const int64_t nb2 = src->nb[2];
- const int64_t nb3 = src->nb[3];
+ GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
+ GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
  const enum ggml_type type = src->type;
  const int64_t ts = ggml_type_size(type);
  const int64_t bs = ggml_blck_size(type);
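Note: the SYCL hunks in this release repeatedly swap hand-written per-dimension locals for the GGML_TENSOR_LOCALS family of macros from ggml.h. As a rough sketch (paraphrased from ggml.h rather than quoted, so exact whitespace and types may differ), the macros expand to declarations like these:

    // Sketch of the macro family used above (paraphrased from ggml.h).
    // GGML_TENSOR_LOCALS_1 declares <prefix>0, _2 adds <prefix>1, _3 adds
    // <prefix>2, and GGML_TENSOR_LOCALS adds <prefix>3; each local is passed
    // to GGML_UNUSED so unreferenced dimensions do not trigger warnings.
    #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0];            \
        GGML_UNUSED(prefix##0);
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array)   \
        GGML_TENSOR_LOCALS_3(type, prefix, pointer, array)     \
        const type prefix##3 = (pointer)->array[3];            \
        GGML_UNUSED(prefix##3);

So GGML_TENSOR_LOCALS(int64_t, nb, src, nb) in the hunk above stands in for the removed nb0 = src->nb[0] through nb3 = src->nb[3] declarations, with the same names and values.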
@@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
+ GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
  const int64_t nrows = ggml_nrows(src0);
 
  //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  ggml_sycl_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) try {
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
 
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
  const int64_t nrows1 = ggml_nrows(src1);
 
  GGML_ASSERT(ne03 == ne13);
@@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
- const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
 
- const int64_t nb01 = src0->nb[1];
- const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
- const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+ GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
 
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
- const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
- const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+ GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
 
  const int64_t ne1 = ggml_nelements(src1);
  const int64_t ne = ggml_nelements(dst);
@@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
  GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
- const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
- const int64_t ne01 = src00->ne[1];
- const int64_t ne02 = src00->ne[2];
- const int64_t ne03 = src00->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
 
  //const int64_t nb01 = src00->nb[1];
- const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
- const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+ GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
 
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
+ GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
  //const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
- const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
 
  const int64_t ne1 = ggml_nelements(src1);
  const int64_t ne = ggml_nelements(dst);
@@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
-
-
- const int64_t nb00 = src0->nb[0];
- const int64_t nb01 = src0->nb[1];
- const int64_t nb02 = src0->nb[2];
- const int64_t nb03 = src0->nb[3];
-
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
-
-
- const int64_t nb10 = src1->nb[0];
- const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2];
- const int64_t nb13 = src1->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
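The ggml_sycl_cpy hunk above uses the aggregate form instead of listing each tensor separately. A minimal sketch of what GGML_TENSOR_BINARY_OP_LOCALS covers, assuming the definition in ggml.h (the nb locals there may be size_t rather than the int64_t of the removed lines, which is why the removed declarations are not reproduced one-for-one):

    // Aggregate sketch: per-dimension locals for src0, src1, and dst in one macro.
    #define GGML_TENSOR_BINARY_OP_LOCALS               \
        GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)     \
        GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)     \
        GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)     \
        GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)     \
        GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)     \
        GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

It also declares dst locals that the removed code never spelled out; since every local is wrapped in GGML_UNUSED, the extra names are harmless.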
@@ -27,6 +27,7 @@
  #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
  #define VK_VENDOR_ID_AMD 0x1002
+ #define VK_VENDOR_ID_APPLE 0x106b
  #define VK_VENDOR_ID_INTEL 0x8086
  #define VK_VENDOR_ID_NVIDIA 0x10de
 
@@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
  return ctx->pipeline_matmul_f32_aligned_l.align;
  }
 
+ static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+ if (bit16_x && bit16_y) {
+ if (m <= 32 || n <= 32) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+ }
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+ }
+ if (bit16_x && !bit16_y) {
+ if (m <= 32 || n <= 32) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+ }
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+ }
+ if (!bit16_x && bit16_y) {
+ GGML_ASSERT(false);
+ }
+
+ if (m <= 32 || n <= 32) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+ }
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+ }
+
+ static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ if (bit16_x && bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+ }
+ if (bit16_x && !bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+ }
+ if (!bit16_x && bit16_y) {
+ GGML_ASSERT(false);
+ }
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+ }
+
+ static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ if (bit16_x && bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+ }
+ if (bit16_x && !bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+ }
+ if (!bit16_x && bit16_y) {
+ GGML_ASSERT(false);
+ }
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+ }
+
  static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
  #endif
+ switch (ctx->device.lock()->vendor_id) {
+ case VK_VENDOR_ID_AMD:
+ return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+ case VK_VENDOR_ID_APPLE:
+ return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+ case VK_VENDOR_ID_INTEL:
+ return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+ }
+
  if (bit16_x && bit16_y) {
- if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+ if (m <= 32 || n <= 32) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " S" << std::endl;
  #endif
  return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
  }
- if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+ if (m <= 64 || n <= 64) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " M" << std::endl;
  #endif
@@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
  return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
  }
  if (bit16_x && !bit16_y) {
- if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+ if (m <= 32 || n <= 32) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " S" << std::endl;
  #endif
  return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
  }
- if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+ if (m <= 64 || n <= 64) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " M" << std::endl;
  #endif
@@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
  GGML_ASSERT(false);
  }
 
- if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+ if (m <= 32 || n <= 32) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " S" << std::endl;
  #endif
  return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
  }
- if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+ if (m <= 64 || n <= 64) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " M" << std::endl;
  #endif
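Taken together, the Vulkan hunks split the matmul shader-size choice (the S/M/L pipeline variants) per GPU vendor: AMD keeps a size-based heuristic, Apple always takes the M variants, Intel always the S variants, and any vendor_id not handled by the switch falls through to the generic size-based selection that follows it. For orientation only, the vendor id being compared against constants like VK_VENDOR_ID_APPLE (0x106b) is the one Vulkan reports in its physical-device properties; a minimal standalone sketch (this helper is illustrative and not part of the patch):

    // Hypothetical helper, not from the patch: print the vendorID that a Vulkan
    // physical device reports, which is the value the dispatch above compares
    // against VK_VENDOR_ID_AMD / VK_VENDOR_ID_APPLE / VK_VENDOR_ID_INTEL.
    #include <vulkan/vulkan.h>
    #include <cstdio>

    static void print_vendor_id(VkPhysicalDevice dev) {
        VkPhysicalDeviceProperties props;
        vkGetPhysicalDeviceProperties(dev, &props); // fills vendorID, deviceName, ...
        std::printf("%s: vendorID = 0x%04x\n", props.deviceName, props.vendorID);
    }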