llama_cpp 0.15.4 → 0.16.1

Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -8928,49 +8928,6 @@ static void rope_neox(
     dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-, const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
-    // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
-    const float sin_theta = sycl::sin((float)theta);
-    const float cos_theta = sycl::cos((float)theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta =
-        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
-    const float sin_block_theta = sycl::sin((float)block_theta);
-    const float cos_block_theta = sycl::cos((float)block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
 static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                            const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(1);
@@ -9151,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     // find the sum of exps in the block
     tmp = warp_reduce_sum(tmp, item_ct1);
     if (block_size > WARP_SIZE) {
+        item_ct1.barrier(sycl::access::fence_space::local_space);
         if (warp_id == 0) {
             buf[lane_id] = 0.f;
         }
@@ -12520,22 +12478,6 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
     }
 }
 
-static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
-                              const int32_t *pos, float freq_scale,
-                              int p_delta_rows, float freq_base, int n_ctx,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4);
-    const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
-                                          p_delta_rows, freq_base, n_ctx,
-                                          item_ct1);
-                         });
-}
-
 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, dpct::queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, WARP_SIZE);
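
Note: the two hunks above drop the GLM-specific RoPE kernel and its launcher; only the NORM/NEOX rotation paths remain. For reference, the rotation those remaining kernels apply to each element pair is the plain 2-D rotation sketched below as standalone C++; the names here are illustrative and not taken from ggml-sycl.cpp.

// Minimal CPU sketch of the RoPE pair rotation: each pair (x0, x1) is
// rotated by an angle theta, exactly as in the kernel's dst[] assignments.
#include <cmath>
#include <cstdio>

static void rotate_pair(float x0, float x1, float theta, float * out0, float * out1) {
    const float c = std::cos(theta);
    const float s = std::sin(theta);
    *out0 = x0 * c - x1 * s;   // corresponds to dst[i + 0]
    *out1 = x0 * s + x1 * c;   // corresponds to dst[i + n_dims/2] in the NEOX layout
}

int main() {
    float y0, y1;
    rotate_pair(1.0f, 0.0f, 0.5f, &y0, &y1);
    std::printf("%f %f\n", y0, y1);   // prints cos(0.5) and sin(0.5)
    return 0;
}
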
@@ -13147,10 +13089,12 @@ void *ggml_sycl_host_malloc(size_t size) try {
         return nullptr;
     }
 
+    ggml_sycl_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
+
     void * ptr = nullptr;
-    //allow to use dpct::get_in_order_queue() for host malloc
     dpct::err0 err = CHECK_TRY_ERROR(
-        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
+        ptr = (void *)sycl::malloc_host(size, *main_stream));
 
     if (err != 0) {
         // clear the error
@@ -13171,8 +13115,9 @@ catch (sycl::exception const &exc) {
 }
 
 void ggml_sycl_host_free(void *ptr) try {
-    //allow to use dpct::get_in_order_queue() for host malloc
-    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    ggml_sycl_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *main_stream)));
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
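
Note: host allocations are now made against the main device's queue instead of dpct::get_in_order_queue(), so the pinned memory is tied to the backend's own context. A minimal standalone SYCL sketch of that allocate/free pattern, assuming any working SYCL 2020 toolchain (this is not llama.cpp code):

// Allocate pinned host memory against one explicit queue and free it
// against the same queue, mirroring the pattern the diff switches to.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    sycl::queue q{sycl::default_selector_v};            // stands in for the backend's main stream
    float * host_buf = sycl::malloc_host<float>(1024, q);
    if (host_buf == nullptr) {
        std::fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    host_buf[0] = 42.0f;                                // directly usable on the host
    sycl::free(host_buf, q);                            // must be freed in the same context it was allocated in
    return 0;
}
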
@@ -14066,8 +14011,8 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
-    const int n_ctx = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+    //const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
     // RoPE alteration for extended context
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -14087,7 +14032,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     }
 
     const bool is_neox = mode & 2;
-    const bool is_glm = mode & 4;
+
+#pragma message("TODO: update rope NORM mode to match NEOX mode")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
 
     if (is_neox) {
         pos = (const int32_t *) src1_dd;
@@ -14100,13 +14047,10 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     }
 
     rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
 
     // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
+    if (is_neox) {
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_sycl(
                 (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
@@ -16631,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_sycl(backend)) {
-        return false;
-    }
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return buft_ctx->device == sycl_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .get_name = */ ggml_backend_sycl_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
     /* .is_host = */ nullptr,
 };
 
@@ -16998,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_sycl(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
 
@@ -17016,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
     /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
 };
 
@@ -17102,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -17246,7 +17172,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
             case GGML_UNARY_OP_HARDSWISH:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
         }
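
Note: these unary ops are now only claimed for contiguous sources. As a rough illustration (an assumption, simplified to non-quantized element types), a contiguity check like ggml_is_contiguous amounts to verifying that each stride equals the previous stride times the previous dimension, with no gaps:

// Simplified, hypothetical contiguity test over ggml-style ne (dims) and
// nb (byte strides) arrays; not the actual ggml implementation.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static bool is_contiguous(const int64_t ne[4], const size_t nb[4], size_t elem_size) {
    if (nb[0] != elem_size) return false;             // innermost stride is one element
    for (int i = 1; i < 4; ++i) {
        if (nb[i] != nb[i - 1] * (size_t) ne[i - 1]) return false;
    }
    return true;
}

int main() {
    const int64_t ne[4] = {8, 4, 1, 1};
    const size_t  nb[4] = {4, 32, 128, 128};          // byte strides of a packed 8x4 float tensor
    std::printf("%d\n", is_contiguous(ne, nb, sizeof(float)) ? 1 : 0);  // prints 1
    return 0;
}
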
@@ -17367,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    return buft_ctx->device == sycl_ctx->device;
+}
 
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name = */ ggml_backend_sycl_name,
@@ -17378,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .synchronize = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .supports_buft = */ ggml_backend_sycl_supports_buft,
     /* .offload_op = */ ggml_backend_sycl_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
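
Note: with this release the buffer-type-side supports_backend hook is gone and each backend reports buffer compatibility itself through the new supports_buft slot (the table also gains graph_plan_update). The sketch below is a hedged, self-contained illustration of that inversion; buffer_type_t, backend_t, and supports_buft here are hypothetical stand-ins, not the real ggml-backend types:

// Hypothetical miniature of the "backend decides buffer compatibility" pattern.
#include <cstdio>

struct buffer_type_t {
    const char * (*get_name)(const buffer_type_t * bt);   // identifies the buffer-type family
    int device;
};

static const char * sycl_buft_name(const buffer_type_t *) { return "SYCL"; }

struct backend_t {
    int device;
    // the backend, not the buffer type, owns the compatibility check
    bool (*supports_buft)(const backend_t * be, const buffer_type_t * bt);
};

// Same shape as ggml_backend_sycl_supports_buft above: right family, same device.
static bool sycl_like_supports_buft(const backend_t * be, const buffer_type_t * bt) {
    if (bt->get_name != sycl_buft_name) {
        return false;
    }
    return bt->device == be->device;
}

int main() {
    buffer_type_t bt = { sycl_buft_name, 0 };
    backend_t     be = { 0, sycl_like_supports_buft };
    std::printf("%d\n", be.supports_buft(&be, &bt) ? 1 : 0);   // prints 1
    return 0;
}
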