llama_cpp 0.9.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/ggml-impl.h (new file)
@@ -0,0 +1,237 @@
+#pragma once
+
+#include "ggml.h"
+
+// GGML internal header
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+
+#define GGML_FP16_TO_FP32(x) ((float) (x))
+#define GGML_FP32_TO_FP16(x) (x)
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// TODO: backend v2 PR
+
+#ifdef __cplusplus
+}
+#endif
data/ext/llama_cpp/src/ggml-metal.m
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
         NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+        if (sourcePath == nil) {
+            GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+            sourcePath = @"ggml-metal.metal";
+        }
         GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
         NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
         if (error) {
@@ -234,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // load kernels
     {
         NSError * error = nil;
-
-
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
-
+            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
         }
 
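
The GGML_METAL_ADD_KERNEL macro relies on token pasting: a single name argument expands into both the "kernel_<name>" Metal function name and the ctx->function_<name> / ctx->pipeline_<name> fields that hold the handles. A standalone C sketch of the same pattern follows; the struct, fields, and lookup function here are made up for illustration and are not gem or ggml code.

    #include <stdio.h>

    struct ctx_t {
        const char * function_add;
        const char * function_mul;
    };

    // Stand-in for -[MTLLibrary newFunctionWithName:]: just echoes the name back.
    static const char * lookup(const char * name) { return name; }

    // One macro argument produces both the string "kernel_<name>" and the field ctx.function_<name>.
    #define ADD_KERNEL(name) \
        ctx.function_##name = lookup("kernel_" #name)

    int main(void) {
        struct ctx_t ctx;
        ADD_KERNEL(add);   // ctx.function_add = lookup("kernel_add")
        ADD_KERNEL(mul);   // ctx.function_mul = lookup("kernel_mul")
        printf("%s %s\n", ctx.function_add, ctx.function_mul);
        return 0;
    }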
@@ -994,11 +1001,15 @@ void ggml_metal_graph_compute(
                     } break;
                 case GGML_OP_SOFT_MAX:
                     {
-
+                        int nth = 32; // SIMD width
 
                         if (ne00%4 == 0) {
                             [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                         } else {
+                            do {
+                                nth *= 2;
+                            } while (nth <= ne00 && nth <= 1024);
+                            nth /= 2;
                             [encoder setComputePipelineState:ctx->pipeline_soft_max];
                         }
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1006,8 +1017,9 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                         [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                         [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01,
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
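
For the non-multiple-of-4 soft_max path, the new code grows nth from the 32-wide SIMD width by doubling while it still fits both the row length ne00 and Metal's 1024-thread threadgroup limit, then steps back one doubling; the added setThreadgroupMemoryLength: call reserves nth/32 floats, one per 32-thread SIMD group, presumably for partial reduction results. A standalone sketch of the size selection with a made-up ne00 (not gem code):

    #include <stdio.h>

    int main(void) {
        const int ne00 = 4100;   // hypothetical row length, not a multiple of 4
        int nth = 32;            // SIMD width
        do {
            nth *= 2;
        } while (nth <= ne00 && nth <= 1024);
        nth /= 2;

        // threadgroup scratch: one float per 32-thread SIMD group
        const unsigned long tg_mem = (unsigned long)(nth / 32) * sizeof(float);
        printf("ne00=%d -> nth=%d, threadgroup memory=%lu bytes\n", ne00, nth, tg_mem);
        return 0;
    }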
@@ -1388,14 +1400,18 @@ void ggml_metal_graph_compute(
 
                         const int nth = MIN(1024, ne00);
 
-                        const int n_past
-                        const int n_dims
-                        const int mode
+                        const int n_past     = ((int32_t *) dst->op_params)[0];
+                        const int n_dims     = ((int32_t *) dst->op_params)[1];
+                        const int mode       = ((int32_t *) dst->op_params)[2];
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
 
-                        float freq_base;
-                        float
-                        memcpy(&
-                        memcpy(&
+                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                        memcpy(&freq_base,   (int32_t *) dst->op_params + 5, sizeof(float));
+                        memcpy(&freq_scale,  (int32_t *) dst->op_params + 6, sizeof(float));
+                        memcpy(&ext_factor,  (int32_t *) dst->op_params + 7, sizeof(float));
+                        memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+                        memcpy(&beta_fast,   (int32_t *) dst->op_params + 9, sizeof(float));
+                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
                         switch (src0->type) {
                             case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
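
The rope handler above reads its integer parameters straight out of the int32_t op_params slots and bit-copies the float parameters out of later slots with memcpy, mirroring how the graph-building code packed them. A standalone sketch of that packing convention with made-up values (the 16-slot array is only illustrative and is not the gem's ggml_tensor definition):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void) {
        int32_t op_params[16] = {0};          // stand-in for dst->op_params

        // producer side: ints go in directly, floats are bit-copied into a slot
        op_params[0] = 128;                   // n_past
        op_params[1] = 64;                    // n_dims
        const float freq_base_in = 10000.0f;
        memcpy(op_params + 5, &freq_base_in, sizeof(float));

        // consumer side: mirrors the Metal backend above
        const int n_past = op_params[0];
        const int n_dims = op_params[1];
        float freq_base;
        memcpy(&freq_base, op_params + 5, sizeof(float));

        printf("n_past=%d n_dims=%d freq_base=%.1f\n", n_past, n_dims, freq_base);
        return 0;
    }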
@@ -1403,30 +1419,35 @@ void ggml_metal_graph_compute(
                             default: GGML_ASSERT(false);
                         };
 
-                        [encoder setBuffer:id_src0
-                        [encoder setBuffer:id_src1
-                        [encoder setBuffer:id_dst
-                        [encoder setBytes:&ne00
-                        [encoder setBytes:&ne01
-                        [encoder setBytes:&ne02
-                        [encoder setBytes:&ne03
-                        [encoder setBytes:&nb00
-                        [encoder setBytes:&nb01
-                        [encoder setBytes:&nb02
-                        [encoder setBytes:&nb03
-                        [encoder setBytes:&ne0
-                        [encoder setBytes:&ne1
-                        [encoder setBytes:&ne2
-                        [encoder setBytes:&ne3
-                        [encoder setBytes:&nb0
-                        [encoder setBytes:&nb1
-                        [encoder setBytes:&nb2
-                        [encoder setBytes:&nb3
-                        [encoder setBytes:&n_past
-                        [encoder setBytes:&n_dims
-                        [encoder setBytes:&mode
-                        [encoder setBytes:&
-                        [encoder setBytes:&
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
+                        [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
+                        [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
+                        [encoder setBytes:&n_past length:sizeof( int) atIndex:19];
+                        [encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
+                        [encoder setBytes:&mode length:sizeof( int) atIndex:21];
+                        [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
+                        [encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
+                        [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
+                        [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
+                        [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
+                        [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
+                        [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
 
                         [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;