llama_cpp 0.9.0 → 0.9.2

ggml-cuda.h
@@ -17,7 +17,12 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES       16
 
+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
 
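The CUDA loader is now split in two: `ggml_init_cublas` always reports success, and the new `ggml_cublas_loaded` tells the caller whether a CUDA device and cuBLAS were actually found. A minimal usage sketch (illustrative only, not part of the diff):

    #include <stdio.h>
    #include "ggml-cuda.h"

    int main(void) {
        ggml_init_cublas();                 // always "succeeds", even with no GPU
        if (ggml_cublas_loaded()) {
            printf("CUDA devices found, cuBLAS loaded\n");
        } else {
            printf("no usable CUDA device, falling back to CPU\n");
        }
        return 0;
    }
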
ggml-impl.h (new file)
@@ -0,0 +1,237 @@
+#pragma once
+
+#include "ggml.h"
+
+// GGML internal header
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+
+#define GGML_FP16_TO_FP32(x) ((float) (x))
+#define GGML_FP32_TO_FP16(x) (x)
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x)   GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x)   GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// TODO: backend v2 PR
+
+#ifdef __cplusplus
+}
+#endif
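The new ggml-impl.h centralizes the scalar FP16 <-> FP32 converters (taken from the FP16 library referenced above). A hedged round-trip check of the fallback path, assuming a build where `ggml_fp16_t` is a plain `uint16_t` (i.e. not the ARM NEON `__fp16` case):

    #include <stdio.h>
    #include "ggml-impl.h"

    int main(void) {
        const float samples[4] = { 0.0f, 1.0f, -2.5f, 65504.0f /* largest finite fp16 */ };
        for (int i = 0; i < 4; ++i) {
            const ggml_fp16_t h    = GGML_COMPUTE_FP32_TO_FP16(samples[i]);
            const float       back = GGML_COMPUTE_FP16_TO_FP32(h);
            // all four samples are exactly representable in fp16, so back == samples[i]
            printf("%g -> 0x%04x -> %g\n", samples[i], (unsigned) h, back);
        }
        return 0;
    }
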
ggml-metal.m
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
         NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+        if (sourcePath == nil) {
+            GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+            sourcePath = @"ggml-metal.metal";
+        }
         GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
         NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
         if (error) {
@@ -234,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // load kernels
     {
         NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
-            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
         }
 
@@ -994,11 +1001,15 @@ void ggml_metal_graph_compute(
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    const int nth = MIN(32, ne00);
+                    int nth = 32; // SIMD width
 
                     if (ne00%4 == 0) {
                         [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                     } else {
+                        do {
+                            nth *= 2;
+                        } while (nth <= ne00 && nth <= 1024);
+                        nth /= 2;
                         [encoder setComputePipelineState:ctx->pipeline_soft_max];
                     }
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1006,8 +1017,9 @@ void ggml_metal_graph_compute(
                     [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                     [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                     [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                    [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
 
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                 } break;
             case GGML_OP_DIAG_MASK_INF:
                 {
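Two details in the soft_max change are worth spelling out. The threadgroup size is no longer hard-capped at 32 threads: the do/while loop grows nth to the largest power of two that still fits the row, clamped to [32, 1024]. A standalone restatement of that sizing logic (hypothetical helper, not in the source):

    static int soft_max_nth(int ne00) {
        int nth = 32;                           // SIMD-group width
        do {
            nth *= 2;                           // grow while it still fits the row
        } while (nth <= ne00 && nth <= 1024);   // 1024 = max threads per threadgroup
        return nth / 2;                         // back off to the last size that fit
    }
    // soft_max_nth(10) == 32, soft_max_nth(100) == 64, soft_max_nth(5000) == 1024

And the MAX(16, nth/32*sizeof(float)) clamp on the threadgroup memory exists because Metal requires the length passed to setThreadgroupMemoryLength to be a multiple of 16 bytes; at the minimum nth of 32 the raw size would be only 4 bytes.
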
@@ -1336,7 +1348,7 @@ void ggml_metal_graph_compute(
                     [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                     [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                     [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                    [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
 
                     const int64_t nrows = ggml_nrows(src0);
 
@@ -1388,14 +1400,19 @@ void ggml_metal_graph_compute(
 
                     const int nth = MIN(1024, ne00);
 
-                    const int n_past = ((int32_t *) dst->op_params)[0];
-                    const int n_dims = ((int32_t *) dst->op_params)[1];
-                    const int mode   = ((int32_t *) dst->op_params)[2];
+                    const int n_past     = ((int32_t *) dst->op_params)[0];
+                    const int n_dims     = ((int32_t *) dst->op_params)[1];
+                    const int mode       = ((int32_t *) dst->op_params)[2];
+                    // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
-                    float freq_base;
-                    float freq_scale;
-                    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
                     switch (src0->type) {
                         case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
@@ -1403,30 +1420,35 @@ void ggml_metal_graph_compute(
                         default: GGML_ASSERT(false);
                     };
 
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
-                    [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
-                    [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
-                    [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
-                    [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
-                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
-                    [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
-                    [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
-                    [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:11];
-                    [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:12];
-                    [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:13];
-                    [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:14];
-                    [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:15];
-                    [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:16];
-                    [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:17];
-                    [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:18];
-                    [encoder setBytes:&n_past     length:sizeof(   int) atIndex:19];
-                    [encoder setBytes:&n_dims     length:sizeof(   int) atIndex:20];
-                    [encoder setBytes:&mode       length:sizeof(   int) atIndex:21];
-                    [encoder setBytes:&freq_base  length:sizeof(float) atIndex:22];
-                    [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
+                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                    [encoder setBuffer:id_src1 offset:offs_src1        atIndex:1];
+                    [encoder setBuffer:id_dst  offset:offs_dst         atIndex:2];
+                    [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:3];
+                    [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:4];
+                    [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:5];
+                    [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:6];
+                    [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:7];
+                    [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:8];
+                    [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:9];
+                    [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:10];
+                    [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:11];
+                    [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:12];
+                    [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:13];
+                    [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:14];
+                    [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:15];
+                    [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:16];
+                    [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:17];
+                    [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:18];
+                    [encoder setBytes:&n_past      length:sizeof(   int) atIndex:19];
+                    [encoder setBytes:&n_dims      length:sizeof(   int) atIndex:20];
+                    [encoder setBytes:&mode        length:sizeof(   int) atIndex:21];
+                    [encoder setBytes:&n_orig_ctx  length:sizeof(   int) atIndex:22];
+                    [encoder setBytes:&freq_base   length:sizeof( float) atIndex:23];
+                    [encoder setBytes:&freq_scale  length:sizeof( float) atIndex:24];
+                    [encoder setBytes:&ext_factor  length:sizeof( float) atIndex:25];
+                    [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
+                    [encoder setBytes:&beta_fast   length:sizeof( float) atIndex:27];
+                    [encoder setBytes:&beta_slow   length:sizeof( float) atIndex:28];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                 } break;
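For reference, the Metal RoPE case now consumes eleven consecutive int32 slots of dst->op_params: n_past, n_dims, mode, n_ctx (slot 3, skipped on Metal), n_orig_ctx, then six floats stored bit-for-bit in the remaining slots. A hypothetical unpacking sketch consistent with the offsets above (the struct and helper are illustrative, not from the source):

    #include <stdint.h>
    #include <string.h>

    struct rope_params {
        int32_t n_past;      // op_params[0]
        int32_t n_dims;      // op_params[1]
        int32_t mode;        // op_params[2]
        int32_t n_ctx;       // op_params[3]  (GLM RoPE, unused by the Metal path)
        int32_t n_orig_ctx;  // op_params[4]
        float   freq_base;   // op_params[5]
        float   freq_scale;  // op_params[6]
        float   ext_factor;  // op_params[7]
        float   attn_factor; // op_params[8]
        float   beta_fast;   // op_params[9]
        float   beta_slow;   // op_params[10]
    };

    static struct rope_params rope_params_unpack(const int32_t * op_params) {
        struct rope_params p;
        memcpy(&p, op_params, sizeof p);   // 11 four-byte fields, no padding
        return p;
    }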