llama_cpp 0.16.2 → 0.17.0
This diff shows the changes between package versions as published to their public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63

data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu
@@ -1,221 +0,0 @@
-#include "norm.cuh"
-
-template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float2 mean_var = make_float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x += xi;
-        mean_var.y += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var);
-    if (block_size > WARP_SIZE) {
-        __shared__ float2 s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        __syncthreads();
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var);
-    }
-
-    const float mean = mean_var.x / ncols;
-    const float var = mean_var.y / ncols - mean * mean;
-    const float inv_std = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-template <int block_size>
-static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-    // blockIdx.x: num_groups idx
-    // threadIdx.x: block_size idx
-    int start = blockIdx.x * group_size;
-    int end = start + group_size;
-
-    start += threadIdx.x;
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float variance = tmp / group_size;
-    float scale = rsqrtf(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = rsqrtf(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    }
-}
-
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
-}
-
-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
-}
-
-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
-}
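
Note: the kernels above lean on warp_reduce_sum / warp_reduce_max, which are defined elsewhere in the vendored tree, plus a shared-memory exchange when block_size exceeds WARP_SIZE. As a hedged illustration of the shuffle-based reduction such calls usually stand for, assuming WARP_SIZE == 32 (the helper below is a sketch, not the vendored implementation):

    // Sketch: butterfly reduction across one 32-lane warp via CUDA shuffles.
    // After the loop, every lane holds the sum of all 32 lanes' inputs.
    static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }

When block_size > WARP_SIZE, each warp first reduces privately, lane 0 of each warp parks its partial in the s_sum[32] array, and a second warp-level pass over those partials yields the block-wide sum; that is the pattern repeated in norm_f32, group_norm_f32, and rms_norm_f32 above.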
data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu
@@ -1,49 +0,0 @@
-#include "pad.cuh"
-
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-    // blockIdx.y: idx of ne1
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02, const int ne03,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
-}
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_d, dst_d,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
-}
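
Note: pad_f32 covers the entire destination: threads whose coordinates fall inside the source extent copy an element, all others zero-fill, so no separate memset pass is needed. A host-side reference of the same index math (hypothetical helper, contiguous row-major layout assumed):

    // Sketch: CPU reference for the zero-padding copy above. The source dims
    // (ne00, ne01, ne02) sit in the corner of the padded destination dims
    // (ne0, ne1, ne2); everything outside the source extent is zeroed.
    static void pad_f32_ref(const float * x, float * dst,
                            int ne00, int ne01, int ne02,
                            int ne0,  int ne1,  int ne2) {
        for (int i2 = 0; i2 < ne2; ++i2) {
            for (int i1 = 0; i1 < ne1; ++i1) {
                for (int i0 = 0; i0 < ne0; ++i0) {
                    const bool in_src = i0 < ne00 && i1 < ne01 && i2 < ne02;
                    dst[(i2*ne1 + i1)*ne0 + i0] =
                        in_src ? x[(i2*ne01 + i1)*ne00 + i0] : 0.0f;
                }
            }
        }
    }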
data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu
@@ -1,94 +0,0 @@
-#include "pool2d.cuh"
-
-template <typename Ti, typename To>
-static __global__ void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx >= parallel_elements) {
-        return;
-    }
-
-    const int I_HW = ih * iw;
-    const int O_HW = oh * ow;
-    const int nc = idx / O_HW;
-    const int cur_oh = idx % O_HW / ow;
-    const int cur_ow = idx % O_HW % ow;
-    const Ti* i_ptr = src + nc * I_HW;
-    To* o_ptr = dst + nc * O_HW;
-    const int start_h = cur_oh * sh - ph;
-    const int bh = max(0, start_h);
-    const int eh = min(ih, start_h + kh);
-    const int start_w = cur_ow * sw - pw;
-    const int bw = max(0, start_w);
-    const int ew = min(iw, start_w + kw);
-    const To scale = 1. / (kh * kw);
-    To res = 0;
-
-    switch (op) {
-        case GGML_OP_POOL_AVG: res = 0; break;
-        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-        default: assert(false);
-    }
-
-    for (int i = bh; i < eh; i += 1) {
-        for (int j = bw; j < ew; j += 1) {
-#if __CUDA_ARCH__ >= 350
-            Ti cur = __ldg(i_ptr + i * iw + j);
-#else
-            Ti cur = i_ptr[i * iw + j];
-#endif
-            switch (op) {
-                case GGML_OP_POOL_AVG: res += cur * scale; break;
-                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
-                default: assert(false);
-            }
-        }
-    }
-    o_ptr[cur_oh * ow + cur_ow] = res;
-}
-
-static void pool2d_nchw_kernel_f32_f32_cuda(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const float * src, float * dst, const enum ggml_op_pool op,
-        cudaStream_t stream) {
-
-    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
-    dim3 block_nums(num_blocks);
-    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
-}
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = src0->ne[1];
-    const int64_t IW = src0->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-
-    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
-}
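
Note: dst->op_params here packs seven int32 values, [op, k0, k1, s0, s1, p0, p1], and the launcher hands them to the kernel height-first (k1, k0, s1, s0, p1, p0) because ggml keeps the width dimension in ne[0]. A scalar sketch of one output element with the same window clamping as the kernel (hypothetical standalone helper):

    #include <algorithm>
    // Sketch: one NCHW average-pool output element, clamping the window to
    // the input just as pool2d_nchw_kernel does; padded positions contribute
    // zero, but the divisor stays the full kh*kw window.
    static float pool2d_avg_one(const float * in, int ih, int iw,
                                int cur_oh, int cur_ow,
                                int kh, int kw, int sh, int sw, int ph, int pw) {
        const int bh = std::max(0, cur_oh*sh - ph);
        const int eh = std::min(ih, cur_oh*sh - ph + kh);
        const int bw = std::max(0, cur_ow*sw - pw);
        const int ew = std::min(iw, cur_ow*sw - pw + kw);
        float res = 0.0f;
        for (int i = bh; i < eh; ++i) {
            for (int j = bw; j < ew; ++j) {
                res += in[i*iw + j] / (kh*kw);  // scale = 1/(kh*kw), as above
            }
        }
        return res;
    }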
data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu
@@ -1,112 +0,0 @@
-#include "quantize.cuh"
-#include <cstdint>
-
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
-    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix0 >= kx0_padded) {
-        return;
-    }
-
-    const int64_t ix1 = blockIdx.y;
-
-    const int64_t i_padded = ix1*kx0_padded + ix0;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int64_t ib = i_padded / QK8_1; // block index
-    const int64_t iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
-    float amax = fabsf(xi);
-    float sum = xi;
-
-    amax = warp_reduce_max(amax);
-    sum = warp_reduce_sum(sum);
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
-}
-
-template <bool need_sum>
-static __global__ void quantize_mmq_q8_1(
-    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
-
-    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix0 >= kx0_padded) {
-        return;
-    }
-
-    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
-
-    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
-
-    const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
-    const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
-    const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
-
-    const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
-    float amax = fabsf(xi);
-
-    amax = warp_reduce_max(amax);
-
-    float sum;
-    if (need_sum) {
-        sum = warp_reduce_sum(xi);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs % QK8_1 != 0) {
-        return;
-    }
-
-    if (need_sum) {
-        y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
-    } else {
-        ((float *) y[ib].ds)[iqs/QK8_1] = d;
-    }
-}
-
-void quantize_row_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-    GGML_ASSERT(kx0_padded % QK8_1 == 0);
-
-    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, kx1*channels, 1);
-    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
-
-    GGML_UNUSED(type_x);
-}
-
-void quantize_mmq_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
-
-    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, kx1, channels);
-    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    if (mmq_need_sum(type_x)) {
-        quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
-    } else {
-        quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
-    }
-}
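
Note: both kernels share the same per-block q8_1 arithmetic: over a block of QK8_1 = 32 values, d = amax/127 and q = round(x/d), with the block's d (and, when needed, its element sum) stored alongside the int8 quants for the dot-product kernels to dequantize against. A scalar sketch of that math (hypothetical helper; the real blocks store d and sum in half precision):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    // Sketch: quantize one block of 32 floats to q8_1-style int8 values,
    // returning the scale d and the block's element sum, mirroring what the
    // kernels above compute via warp_reduce_max / warp_reduce_sum.
    static void quantize_q8_1_block_ref(const float * x, int8_t * qs,
                                        float * d_out, float * sum_out) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < 32; ++i) {
            amax = std::max(amax, std::fabs(x[i]));
            sum += x[i];
        }
        const float d = amax / 127.0f;  // max magnitude maps to +/-127
        for (int i = 0; i < 32; ++i) {
            qs[i] = amax == 0.0f ? 0 : (int8_t)std::round(x[i] / d);
        }
        *d_out = d;      // kernel stores this as half in y[ib].ds.x
        *sum_out = sum;  // kernel stores this as half in y[ib].ds.y
    }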