llama_cpp 0.16.2 → 0.17.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63
--- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu
+++ /dev/null
@@ -1,221 +0,0 @@
-#include "norm.cuh"
-
-template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float2 mean_var = make_float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x += xi;
-        mean_var.y += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var);
-    if (block_size > WARP_SIZE) {
-        __shared__ float2 s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        __syncthreads();
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var);
-    }
-
-    const float mean = mean_var.x / ncols;
-    const float var = mean_var.y / ncols - mean * mean;
-    const float inv_std = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-template <int block_size>
-static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-    // blockIdx.x: num_groups idx
-    // threadIdx.x: block_size idx
-    int start = blockIdx.x * group_size;
-    int end = start + group_size;
-
-    start += threadIdx.x;
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float variance = tmp / group_size;
-    float scale = rsqrtf(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = rsqrtf(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    }
-}
-
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
-}
-
-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
-}
-
-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
-}
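For readers skimming the removed norm.cu above: the rms_norm_f32 kernel scales each row by 1/sqrt(mean(x^2) + eps). Below is a minimal single-threaded C++ sketch of that same row-wise computation (the name rms_norm_f32_ref and the loop structure are ours, not part of the gem); the CUDA kernel parallelizes the sum of squares across a block with warp reductions instead.

// Minimal CPU sketch of the row-wise RMS norm computed by the deleted kernel:
// dst = x / sqrt(mean(x^2) + eps), applied independently to each row.
#include <cmath>
#include <cstddef>

static void rms_norm_f32_ref(const float * x, float * dst, size_t ncols, size_t nrows, float eps) {
    for (size_t row = 0; row < nrows; ++row) {
        const float * xr = x + row * ncols;
        float * dr = dst + row * ncols;
        float sum_sq = 0.0f;
        for (size_t col = 0; col < ncols; ++col) {
            sum_sq += xr[col] * xr[col];              // accumulate sum of squares
        }
        const float scale = 1.0f / std::sqrt(sum_sq / ncols + eps);
        for (size_t col = 0; col < ncols; ++col) {
            dr[col] = scale * xr[col];                // rescale each element
        }
    }
}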
--- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "pad.cuh"
-
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-    // blockIdx.y: idx of ne1
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02, const int ne03,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
-}
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_d, dst_d,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
-}
--- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "pool2d.cuh"
-
-template <typename Ti, typename To>
-static __global__ void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx >= parallel_elements) {
-        return;
-    }
-
-    const int I_HW = ih * iw;
-    const int O_HW = oh * ow;
-    const int nc = idx / O_HW;
-    const int cur_oh = idx % O_HW / ow;
-    const int cur_ow = idx % O_HW % ow;
-    const Ti* i_ptr = src + nc * I_HW;
-    To* o_ptr = dst + nc * O_HW;
-    const int start_h = cur_oh * sh - ph;
-    const int bh = max(0, start_h);
-    const int eh = min(ih, start_h + kh);
-    const int start_w = cur_ow * sw - pw;
-    const int bw = max(0, start_w);
-    const int ew = min(iw, start_w + kw);
-    const To scale = 1. / (kh * kw);
-    To res = 0;
-
-    switch (op) {
-        case GGML_OP_POOL_AVG: res = 0; break;
-        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-        default: assert(false);
-    }
-
-    for (int i = bh; i < eh; i += 1) {
-        for (int j = bw; j < ew; j += 1) {
-#if __CUDA_ARCH__ >= 350
-            Ti cur = __ldg(i_ptr + i * iw + j);
-#else
-            Ti cur = i_ptr[i * iw + j];
-#endif
-            switch (op) {
-                case GGML_OP_POOL_AVG: res += cur * scale; break;
-                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
-                default: assert(false);
-            }
-        }
-    }
-    o_ptr[cur_oh * ow + cur_ow] = res;
-}
-
-static void pool2d_nchw_kernel_f32_f32_cuda(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const float * src, float * dst, const enum ggml_op_pool op,
-        cudaStream_t stream) {
-
-    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
-    dim3 block_nums(num_blocks);
-    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
-}
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = src0->ne[1];
-    const int64_t IW = src0->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-
-    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
-}
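The removed pool2d_nchw_kernel assigns one thread per output element and, for GGML_OP_POOL_AVG, sums the in-bounds window samples while always dividing by the full kernel area kh*kw. Here is a single-channel CPU sketch of that average-pooling path (names and signature are ours; the real kernel also handles GGML_OP_POOL_MAX and covers all N*C channels).

// Average pooling over one HxW channel with stride (sh, sw) and padding (ph, pw);
// out-of-bounds samples are skipped but the divisor stays kh*kw, as in the kernel.
#include <algorithm>

static void pool2d_avg_nchw_ref(const float * src, float * dst,
                                int ih, int iw, int oh, int ow,
                                int kh, int kw, int sh, int sw, int ph, int pw) {
    for (int oy = 0; oy < oh; ++oy) {
        for (int ox = 0; ox < ow; ++ox) {
            const int h0 = std::max(0, oy * sh - ph), h1 = std::min(ih, oy * sh - ph + kh);
            const int w0 = std::max(0, ox * sw - pw), w1 = std::min(iw, ox * sw - pw + kw);
            float acc = 0.0f;
            for (int y = h0; y < h1; ++y) {
                for (int x = w0; x < w1; ++x) {
                    acc += src[y * iw + x];
                }
            }
            dst[oy * ow + ox] = acc / (kh * kw);   // divisor is the full kernel area
        }
    }
}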
--- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu
+++ /dev/null
@@ -1,112 +0,0 @@
-#include "quantize.cuh"
-#include <cstdint>
-
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
-    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix0 >= kx0_padded) {
-        return;
-    }
-
-    const int64_t ix1 = blockIdx.y;
-
-    const int64_t i_padded = ix1*kx0_padded + ix0;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int64_t ib = i_padded / QK8_1; // block index
-    const int64_t iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
-    float amax = fabsf(xi);
-    float sum = xi;
-
-    amax = warp_reduce_max(amax);
-    sum = warp_reduce_sum(sum);
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
-}
-
-template <bool need_sum>
-static __global__ void quantize_mmq_q8_1(
-    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
-
-    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix0 >= kx0_padded) {
-        return;
-    }
-
-    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
-
-    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
-
-    const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
-    const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
-    const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
-
-    const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
-    float amax = fabsf(xi);
-
-    amax = warp_reduce_max(amax);
-
-    float sum;
-    if (need_sum) {
-        sum = warp_reduce_sum(xi);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs % QK8_1 != 0) {
-        return;
-    }
-
-    if (need_sum) {
-        y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
-    } else {
-        ((float *) y[ib].ds)[iqs/QK8_1] = d;
-    }
-}
-
-void quantize_row_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-    GGML_ASSERT(kx0_padded % QK8_1 == 0);
-
-    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, kx1*channels, 1);
-    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
-
-    GGML_UNUSED(type_x);
-}
-
-void quantize_mmq_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
-
-    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, kx1, channels);
-    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    if (mmq_need_sum(type_x)) {
-        quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
-    } else {
-        quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
-    }
-}
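The removed quantize_q8_1 kernel quantizes each block of QK8_1 floats to int8 with a shared scale d = amax / 127 and stores the block sum next to the scale for use in the q8_1 dot products. Below is a CPU sketch of that per-block arithmetic; the struct uses plain float fields as a stand-in for ggml's block_q8_1 (which packs d and the sum as half precision), and the function name is ours.

// Per-block Q8_1 quantization: one scale per QK8_1 values, plus the block sum.
#include <cmath>
#include <cstdint>

constexpr int QK8_1 = 32;         // block size used by ggml's q8_1 format

struct block_q8_1_ref {           // simplified stand-in for ggml's block_q8_1
    float d;                      // scale
    float s;                      // sum of the unquantized values
    int8_t qs[QK8_1];             // quantized values
};

static void quantize_block_q8_1_ref(const float * x, block_q8_1_ref & y) {
    float amax = 0.0f, sum = 0.0f;
    for (int i = 0; i < QK8_1; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
        sum += x[i];
    }
    const float d = amax / 127.0f;
    for (int i = 0; i < QK8_1; ++i) {
        y.qs[i] = amax == 0.0f ? 0 : (int8_t)std::roundf(x[i] / d);
    }
    y.d = d;
    y.s = sum;
}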