llama_cpp 0.15.4 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -8928,49 +8928,6 @@ static void rope_neox(
8928
8928
  dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
8929
8929
  }
8930
8930
 
8931
- static void rope_glm_f32(
8932
- const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
8933
- int n_ctx
8934
- , const sycl::nd_item<3> &item_ct1) {
8935
- const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
8936
- item_ct1.get_local_id(2);
8937
- const int half_n_dims = ncols/4;
8938
-
8939
- if (col >= half_n_dims) {
8940
- return;
8941
- }
8942
-
8943
- const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
8944
- item_ct1.get_local_id(1);
8945
- const int i = row*ncols + col;
8946
- const int i2 = row/p_delta_rows;
8947
-
8948
- const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
8949
- // FIXME: this is likely wrong
8950
- const int p = pos != nullptr ? pos[i2] : 0;
8951
-
8952
- const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
8953
- const float sin_theta = sycl::sin((float)theta);
8954
- const float cos_theta = sycl::cos((float)theta);
8955
-
8956
- const float x0 = x[i + 0];
8957
- const float x1 = x[i + half_n_dims];
8958
-
8959
- dst[i + 0] = x0*cos_theta - x1*sin_theta;
8960
- dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
8961
-
8962
- const float block_theta =
8963
- ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
8964
- const float sin_block_theta = sycl::sin((float)block_theta);
8965
- const float cos_block_theta = sycl::cos((float)block_theta);
8966
-
8967
- const float x2 = x[i + half_n_dims * 2];
8968
- const float x3 = x[i + half_n_dims * 3];
8969
-
8970
- dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
8971
- dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
8972
- }
8973
-
8974
8931
  static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
8975
8932
  const sycl::nd_item<3> &item_ct1) {
8976
8933
  const int row = item_ct1.get_group(1);
@@ -9151,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
9151
9108
  // find the sum of exps in the block
9152
9109
  tmp = warp_reduce_sum(tmp, item_ct1);
9153
9110
  if (block_size > WARP_SIZE) {
9111
+ item_ct1.barrier(sycl::access::fence_space::local_space);
9154
9112
  if (warp_id == 0) {
9155
9113
  buf[lane_id] = 0.f;
9156
9114
  }
@@ -12520,22 +12478,6 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
12520
12478
  }
12521
12479
  }
12522
12480
 
12523
- static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
12524
- const int32_t *pos, float freq_scale,
12525
- int p_delta_rows, float freq_base, int n_ctx,
12526
- dpct::queue_ptr stream) {
12527
- GGML_ASSERT(ncols % 4 == 0);
12528
- const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4);
12529
- const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE;
12530
- const sycl::range<3> block_nums(1, nrows, num_blocks_x);
12531
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
12532
- [=](sycl::nd_item<3> item_ct1) {
12533
- rope_glm_f32(x, dst, ncols, pos, freq_scale,
12534
- p_delta_rows, freq_base, n_ctx,
12535
- item_ct1);
12536
- });
12537
- }
12538
-
12539
12481
  static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
12540
12482
  const int nrows, dpct::queue_ptr stream) {
12541
12483
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13147,10 +13089,12 @@ void *ggml_sycl_host_malloc(size_t size) try {
13147
13089
  return nullptr;
13148
13090
  }
13149
13091
 
13092
+ ggml_sycl_set_device(g_main_device);
13093
+ dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
13094
+
13150
13095
  void * ptr = nullptr;
13151
- //allow to use dpct::get_in_order_queue() for host malloc
13152
13096
  dpct::err0 err = CHECK_TRY_ERROR(
13153
- ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
13097
+ ptr = (void *)sycl::malloc_host(size, *main_stream));
13154
13098
 
13155
13099
  if (err != 0) {
13156
13100
  // clear the error
@@ -13171,8 +13115,9 @@ catch (sycl::exception const &exc) {
13171
13115
  }
13172
13116
 
13173
13117
  void ggml_sycl_host_free(void *ptr) try {
13174
- //allow to use dpct::get_in_order_queue() for host malloc
13175
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
13118
+ ggml_sycl_set_device(g_main_device);
13119
+ dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
13120
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *main_stream)));
13176
13121
  }
13177
13122
  catch (sycl::exception const &exc) {
13178
13123
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -14066,8 +14011,8 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
14066
14011
  //const int n_past = ((int32_t *) dst->op_params)[0];
14067
14012
  const int n_dims = ((int32_t *) dst->op_params)[1];
14068
14013
  const int mode = ((int32_t *) dst->op_params)[2];
14069
- const int n_ctx = ((int32_t *) dst->op_params)[3];
14070
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
14014
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
14015
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
14071
14016
 
14072
14017
  // RoPE alteration for extended context
14073
14018
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -14087,7 +14032,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
14087
14032
  }
14088
14033
 
14089
14034
  const bool is_neox = mode & 2;
14090
- const bool is_glm = mode & 4;
14035
+
14036
+ #pragma message("TODO: update rope NORM mode to match NEOX mode")
14037
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
14091
14038
 
14092
14039
  if (is_neox) {
14093
14040
  pos = (const int32_t *) src1_dd;
@@ -14100,13 +14047,10 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
14100
14047
  }
14101
14048
 
14102
14049
  rope_corr_dims corr_dims;
14103
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
14050
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
14104
14051
 
14105
14052
  // compute
14106
- if (is_glm) {
14107
- GGML_ASSERT(false);
14108
- rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
14109
- } else if (is_neox) {
14053
+ if (is_neox) {
14110
14054
  if (src0->type == GGML_TYPE_F32) {
14111
14055
  rope_neox_sycl(
14112
14056
  (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
@@ -16631,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
16631
16575
  UNUSED(buft);
16632
16576
  }
16633
16577
 
16634
- GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
16635
- if (!ggml_backend_is_sycl(backend)) {
16636
- return false;
16637
- }
16638
- ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
16639
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
16640
- return buft_ctx->device == sycl_ctx->device;
16641
- }
16642
-
16643
16578
  static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
16644
16579
  /* .get_name = */ ggml_backend_sycl_buffer_type_name,
16645
16580
  /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
16646
16581
  /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
16647
16582
  /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
16648
16583
  /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
16649
- /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
16650
16584
  /* .is_host = */ nullptr,
16651
16585
  };
16652
16586
 
@@ -16998,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
16998
16932
  return total_size;
16999
16933
  }
17000
16934
 
17001
- GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
17002
- return ggml_backend_is_sycl(backend);
17003
-
17004
- UNUSED(buft);
17005
- }
17006
-
17007
16935
  GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
17008
16936
  return false;
17009
16937
 
@@ -17016,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
17016
16944
  /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
17017
16945
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
17018
16946
  /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
17019
- /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
17020
16947
  /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
17021
16948
  };
17022
16949
 
@@ -17102,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
17102
17029
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
17103
17030
  /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
17104
17031
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
17105
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
17106
17032
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
17107
17033
  },
17108
17034
  /* .context = */ nullptr,
@@ -17246,7 +17172,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
17246
17172
  case GGML_UNARY_OP_HARDSWISH:
17247
17173
  case GGML_UNARY_OP_GELU_QUICK:
17248
17174
  case GGML_UNARY_OP_TANH:
17249
- return true;
17175
+ return ggml_is_contiguous(op->src[0]);
17250
17176
  default:
17251
17177
  return false;
17252
17178
  }
@@ -17367,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
17367
17293
  GGML_UNUSED(backend);
17368
17294
  }
17369
17295
 
17296
+ GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
17297
+ if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
17298
+ return false;
17299
+ }
17300
+ ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
17301
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
17302
+ return buft_ctx->device == sycl_ctx->device;
17303
+ }
17370
17304
 
17371
17305
  static ggml_backend_i ggml_backend_sycl_interface = {
17372
17306
  /* .get_name = */ ggml_backend_sycl_name,
@@ -17378,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
17378
17312
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17379
17313
  /* .graph_plan_create = */ NULL,
17380
17314
  /* .graph_plan_free = */ NULL,
17315
+ /* .graph_plan_update = */ NULL,
17381
17316
  /* .graph_plan_compute = */ NULL,
17382
17317
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17383
17318
  /* .supports_op = */ ggml_backend_sycl_supports_op,
17319
+ /* .supports_buft = */ ggml_backend_sycl_supports_buft,
17384
17320
  /* .offload_op = */ ggml_backend_sycl_offload_op,
17385
17321
  /* .event_new = */ NULL,
17386
17322
  /* .event_free = */ NULL,