llama_cpp 0.12.5 → 0.12.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
  }
  char * dst_ptr = (char *) dst;

- const int64_t ne0 = src->ne[0];
- const int64_t nb0 = src->nb[0];
- const int64_t nb1 = src->nb[1];
- const int64_t nb2 = src->nb[2];
- const int64_t nb3 = src->nb[3];
+ GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
+ GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
  const enum ggml_type type = src->type;
  const int64_t ts = ggml_type_size(type);
  const int64_t bs = ggml_blck_size(type);
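Note: the SYCL hunks in this release replace hand-rolled per-dimension locals (ne0 and nb0..nb3 here; ne00, ne01, ... below) with ggml's GGML_TENSOR_LOCALS helper macros. As a rough sketch of what those helpers expand to, paraphrased from the upstream ggml headers rather than quoted from this diff (GGML_UNUSED is assumed to be a (void)-cast marker):

    /* Illustrative approximation of the upstream macros, not verbatim. */
    #define GGML_UNUSED(x) (void)(x)

    /* Declares prefix##0 from pointer->array[0]. */
    #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0);

    /* Declares prefix##0 .. prefix##3 from pointer->array[0..3]. */
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array)                \
        const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0); \
        const type prefix##1 = (pointer)->array[1]; GGML_UNUSED(prefix##1); \
        const type prefix##2 = (pointer)->array[2]; GGML_UNUSED(prefix##2); \
        const type prefix##3 = (pointer)->array[3]; GGML_UNUSED(prefix##3);

    /* The two macro lines in the hunk above thus declare ne0 and nb0..nb3,
       covering the five explicit declarations they replace; GGML_TENSOR_LOCALS_3
       (used in the next hunk) stops at index 2. */

Because every generated local is already silenced with GGML_UNUSED, the explicit "GGML_UNUSED(ne00)" annotations removed later in this diff are no longer needed.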
@@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
+ GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
  const int64_t nrows = ggml_nrows(src0);

  //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  ggml_sycl_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) try {

- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
  const int64_t nrows1 = ggml_nrows(src1);

  GGML_ASSERT(ne03 == ne13);
@@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

- const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

- const int64_t nb01 = src0->nb[1];
- const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
- const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+ GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);

- const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
- const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+ GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);

  const int64_t ne1 = ggml_nelements(src1);
  const int64_t ne = ggml_nelements(dst);
@@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
  GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

- const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
- const int64_t ne01 = src00->ne[1];
- const int64_t ne02 = src00->ne[2];
- const int64_t ne03 = src00->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);

  //const int64_t nb01 = src00->nb[1];
- const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
- const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+ GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);

+ GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
  //const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
- const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);

  const int64_t ne1 = ggml_nelements(src1);
  const int64_t ne = ggml_nelements(dst);
@@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
-
-
- const int64_t nb00 = src0->nb[0];
- const int64_t nb01 = src0->nb[1];
- const int64_t nb02 = src0->nb[2];
- const int64_t nb03 = src0->nb[3];
-
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
-
-
- const int64_t nb10 = src1->nb[0];
- const int64_t nb11 = src1->nb[1];
- const int64_t nb12 = src1->nb[2];
- const int64_t nb13 = src1->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;

  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
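The ggml_sycl_cpy hunk goes one step further and collapses every shape/stride local for both operands into a single GGML_TENSOR_BINARY_OP_LOCALS. In upstream ggml that macro is, approximately, the per-tensor macro above applied to the variables named src0, src1, and dst that it expects to find in scope; a hedged sketch (exact element types follow the upstream header, while the removed SYCL code declared everything as int64_t):

    /* Illustrative approximation, not verbatim. */
    #define GGML_TENSOR_BINARY_OP_LOCALS           \
        GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
        GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
        GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
        GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
        GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
        GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

This declares ne00..ne03 and nb00..nb03 for src0, ne10..ne13 and nb10..nb13 for src1, and ne0..ne3 / nb0..nb3 for dst, so the function body keeps compiling with the same identifiers it used before; any locals it never reads are covered by the GGML_UNUSED markers inside GGML_TENSOR_LOCALS.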
@@ -27,6 +27,7 @@
  #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

  #define VK_VENDOR_ID_AMD 0x1002
+ #define VK_VENDOR_ID_APPLE 0x106b
  #define VK_VENDOR_ID_INTEL 0x8086
  #define VK_VENDOR_ID_NVIDIA 0x10de

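The Vulkan backend hunks begin here. The new VK_VENDOR_ID_APPLE constant (0x106b) is Apple's PCI vendor ID, which is what a Vulkan driver such as MoltenVK reports for Apple GPUs; the vendor_id field used in the dispatch below is presumably populated from VkPhysicalDeviceProperties::vendorID. A minimal, self-contained sketch of reading that field (the helper name is hypothetical and not part of this diff):

    #include <vulkan/vulkan.h>
    #include <cstdint>

    // Hypothetical helper: return the vendor ID the driver reports for a device.
    static uint32_t query_vendor_id(VkPhysicalDevice physical_device) {
        VkPhysicalDeviceProperties props{};
        vkGetPhysicalDeviceProperties(physical_device, &props);
        return props.vendorID;  // 0x1002 AMD, 0x106b Apple, 0x8086 Intel, 0x10de NVIDIA
    }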
@@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx
  return ctx->pipeline_matmul_f32_aligned_l.align;
  }

+ static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+ if (bit16_x && bit16_y) {
+ if (m <= 32 || n <= 32) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+ }
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+ }
+ if (bit16_x && !bit16_y) {
+ if (m <= 32 || n <= 32) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+ }
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+ }
+ if (!bit16_x && bit16_y) {
+ GGML_ASSERT(false);
+ }
+
+ if (m <= 32 || n <= 32) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+ }
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+ }
+
+ static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " M" << std::endl;
+ #endif
+ if (bit16_x && bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+ }
+ if (bit16_x && !bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+ }
+ if (!bit16_x && bit16_y) {
+ GGML_ASSERT(false);
+ }
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+ }
+
+ static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << " S" << std::endl;
+ #endif
+ if (bit16_x && bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+ }
+ if (bit16_x && !bit16_y) {
+ return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+ }
+ if (!bit16_x && bit16_y) {
+ GGML_ASSERT(false);
+ }
+ return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+ }
+
  static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
  #endif
+ switch (ctx->device.lock()->vendor_id) {
+ case VK_VENDOR_ID_AMD:
+ return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+ case VK_VENDOR_ID_APPLE:
+ return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+ case VK_VENDOR_ID_INTEL:
+ return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+ }
+
  if (bit16_x && bit16_y) {
- if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+ if (m <= 32 || n <= 32) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " S" << std::endl;
  #endif
  return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
  }
- if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+ if (m <= 64 || n <= 64) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " M" << std::endl;
  #endif
@@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
  return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
  }
  if (bit16_x && !bit16_y) {
- if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+ if (m <= 32 || n <= 32) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " S" << std::endl;
  #endif
  return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
  }
- if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+ if (m <= 64 || n <= 64) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " M" << std::endl;
  #endif
@@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
  GGML_ASSERT(false);
  }

- if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+ if (m <= 32 || n <= 32) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " S" << std::endl;
  #endif
  return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
  }
- if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+ if (m <= 64 || n <= 64) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << " M" << std::endl;
  #endif
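Taken together, the ggml_vk_guess_matmul_pipeline changes replace the per-vendor special cases that were embedded in the generic path (the removed VK_VENDOR_ID_INTEL and subgroup_size == 64 checks) with dedicated helpers: Intel devices always get the small "s" shader variants, Apple devices always get the medium "m" variants, AMD devices choose between "s" and "m" based on matrix size, and every other vendor falls through to the generic rule, which now depends only on m and n. A condensed, illustrative sketch of that fallback tile choice (names here are hypothetical, not from the diff):

    // Illustrative only: the S/M/L decision made by the fallback path after this change.
    enum class MatmulTile { S, M, L };

    static MatmulTile guess_tile(int m, int n) {
        if (m <= 32 || n <= 32) return MatmulTile::S;  // small shader variant
        if (m <= 64 || n <= 64) return MatmulTile::M;  // medium shader variant
        return MatmulTile::L;                          // large shader variant
    }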