llama_cpp 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ // GGML internal header
6
+
7
+ #include <assert.h>
8
+ #include <stddef.h>
9
+ #include <stdbool.h>
10
+ #include <string.h> // memcpy
11
+ #include <math.h> // fabsf
12
+
13
+ #ifdef __cplusplus
14
+ extern "C" {
15
+ #endif
16
+
17
+ // static_assert should be a #define, but if it's not,
18
+ // fall back to the _Static_assert C11 keyword.
19
+ // if C99 - static_assert is noop
20
+ // ref: https://stackoverflow.com/a/53923785/4039976
21
+ #ifndef static_assert
22
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
23
+ #define static_assert(cond, msg) _Static_assert(cond, msg)
24
+ #else
25
+ #define static_assert(cond, msg) struct global_scope_noop_trick
26
+ #endif
27
+ #endif
28
+
29
+ // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
30
+ #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
31
+ #ifndef __FMA__
32
+ #define __FMA__
33
+ #endif
34
+ #ifndef __F16C__
35
+ #define __F16C__
36
+ #endif
37
+ #ifndef __SSE3__
38
+ #define __SSE3__
39
+ #endif
40
+ #endif
41
+
42
+ #undef MIN
43
+ #undef MAX
44
+
45
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
46
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
47
+
48
+ // 16-bit float
49
+ // on Arm, we use __fp16
50
+ // on x86, we use uint16_t
51
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)
52
+
53
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
54
+ //
55
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
56
+ //
57
+ #include <arm_neon.h>
58
+
59
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
60
+ #define GGML_COMPUTE_FP32_TO_FP16(x) (x)
61
+
62
+ #define GGML_FP16_TO_FP32(x) ((float) (x))
63
+ #define GGML_FP32_TO_FP16(x) (x)
64
+
65
+ #else
66
+
67
+ #ifdef __wasm_simd128__
68
+ #include <wasm_simd128.h>
69
+ #else
70
+ #ifdef __POWER9_VECTOR__
71
+ #include <altivec.h>
72
+ #undef bool
73
+ #define bool _Bool
74
+ #else
75
+ #if defined(_MSC_VER) || defined(__MINGW32__)
76
+ #include <intrin.h>
77
+ #else
78
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
79
+ #if !defined(__riscv)
80
+ #include <immintrin.h>
81
+ #endif
82
+ #endif
83
+ #endif
84
+ #endif
85
+ #endif
86
+
87
+ #ifdef __riscv_v_intrinsic
88
+ #include <riscv_vector.h>
89
+ #endif
90
+
91
+ #ifdef __F16C__
92
+
93
+ #ifdef _MSC_VER
94
+ #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
95
+ #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
96
+ #else
97
+ #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
98
+ #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
99
+ #endif
100
+
101
+ #elif defined(__POWER9_VECTOR__)
102
+
103
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
104
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
105
+ /* the inline asm below is about 12% faster than the lookup method */
106
+ #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
107
+ #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
108
+
109
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
110
+ register float f;
111
+ register double d;
112
+ __asm__(
113
+ "mtfprd %0,%2\n"
114
+ "xscvhpdp %0,%0\n"
115
+ "frsp %1,%0\n" :
116
+ /* temp */ "=d"(d),
117
+ /* out */ "=f"(f):
118
+ /* in */ "r"(h));
119
+ return f;
120
+ }
121
+
122
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
123
+ register double d;
124
+ register ggml_fp16_t r;
125
+ __asm__( /* xscvdphp can work on double or single precision */
126
+ "xscvdphp %0,%2\n"
127
+ "mffprd %1,%0\n" :
128
+ /* temp */ "=d"(d),
129
+ /* out */ "=r"(r):
130
+ /* in */ "f"(f));
131
+ return r;
132
+ }
133
+
134
+ #else
135
+
136
+ // FP16 <-> FP32
137
+ // ref: https://github.com/Maratyszcza/FP16
138
+
139
+ static inline float fp32_from_bits(uint32_t w) {
140
+ union {
141
+ uint32_t as_bits;
142
+ float as_value;
143
+ } fp32;
144
+ fp32.as_bits = w;
145
+ return fp32.as_value;
146
+ }
147
+
148
+ static inline uint32_t fp32_to_bits(float f) {
149
+ union {
150
+ float as_value;
151
+ uint32_t as_bits;
152
+ } fp32;
153
+ fp32.as_value = f;
154
+ return fp32.as_bits;
155
+ }
156
+
157
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
158
+ const uint32_t w = (uint32_t) h << 16;
159
+ const uint32_t sign = w & UINT32_C(0x80000000);
160
+ const uint32_t two_w = w + w;
161
+
162
+ const uint32_t exp_offset = UINT32_C(0xE0) << 23;
163
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
164
+ const float exp_scale = 0x1.0p-112f;
165
+ #else
166
+ const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
167
+ #endif
168
+ const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
169
+
170
+ const uint32_t magic_mask = UINT32_C(126) << 23;
171
+ const float magic_bias = 0.5f;
172
+ const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
173
+
174
+ const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
175
+ const uint32_t result = sign |
176
+ (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
177
+ return fp32_from_bits(result);
178
+ }
179
+
180
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
181
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
182
+ const float scale_to_inf = 0x1.0p+112f;
183
+ const float scale_to_zero = 0x1.0p-110f;
184
+ #else
185
+ const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
186
+ const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
187
+ #endif
188
+ float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
189
+
190
+ const uint32_t w = fp32_to_bits(f);
191
+ const uint32_t shl1_w = w + w;
192
+ const uint32_t sign = w & UINT32_C(0x80000000);
193
+ uint32_t bias = shl1_w & UINT32_C(0xFF000000);
194
+ if (bias < UINT32_C(0x71000000)) {
195
+ bias = UINT32_C(0x71000000);
196
+ }
197
+
198
+ base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
199
+ const uint32_t bits = fp32_to_bits(base);
200
+ const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
201
+ const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
202
+ const uint32_t nonsign = exp_bits + mantissa_bits;
203
+ return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
204
+ }
205
+
206
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
207
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
208
+
209
+ #endif // __F16C__
210
+
211
+ #endif // __ARM_NEON
212
+
213
+ // precomputed f32 table for f16 (256 KB)
214
+ // defined in ggml.c, initialized in ggml_init()
215
+ extern float ggml_table_f32_f16[1 << 16];
216
+
217
+ // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
218
+ // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
219
+ // This is also true for POWER9.
220
+ #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
221
+
222
+ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
223
+ uint16_t s;
224
+ memcpy(&s, &f, sizeof(uint16_t));
225
+ return ggml_table_f32_f16[s];
226
+ }
227
+
228
+ #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
229
+ #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
230
+
231
+ #endif
232
+
233
+ // TODO: backend v2 PR
234
+
235
+ #ifdef __cplusplus
236
+ }
237
+ #endif
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
210
210
  GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
211
211
 
212
212
  NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
213
+ if (sourcePath == nil) {
214
+ GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
215
+ sourcePath = @"ggml-metal.metal";
216
+ }
213
217
  GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
214
218
  NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
215
219
  if (error) {
@@ -234,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
234
238
  // load kernels
235
239
  {
236
240
  NSError * error = nil;
237
- #define GGML_METAL_ADD_KERNEL(name) \
238
- ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
239
- ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
241
+
242
+ /*
240
243
  GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
241
244
  (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
242
245
  (int) ctx->pipeline_##name.threadExecutionWidth); \
246
+ */
247
+ #define GGML_METAL_ADD_KERNEL(name) \
248
+ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
249
+ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
243
250
  if (error) { \
244
- GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
251
+ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
245
252
  return NULL; \
246
253
  }
247
254
 
@@ -994,11 +1001,15 @@ void ggml_metal_graph_compute(
994
1001
  } break;
995
1002
  case GGML_OP_SOFT_MAX:
996
1003
  {
997
- const int nth = MIN(32, ne00);
1004
+ int nth = 32; // SIMD width
998
1005
 
999
1006
  if (ne00%4 == 0) {
1000
1007
  [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
1001
1008
  } else {
1009
+ do {
1010
+ nth *= 2;
1011
+ } while (nth <= ne00 && nth <= 1024);
1012
+ nth /= 2;
1002
1013
  [encoder setComputePipelineState:ctx->pipeline_soft_max];
1003
1014
  }
1004
1015
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1006,8 +1017,9 @@ void ggml_metal_graph_compute(
1006
1017
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
1007
1018
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
1008
1019
  [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
1020
+ [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
1009
1021
 
1010
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
1022
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
1011
1023
  } break;
1012
1024
  case GGML_OP_DIAG_MASK_INF:
1013
1025
  {
@@ -1388,14 +1400,18 @@ void ggml_metal_graph_compute(
1388
1400
 
1389
1401
  const int nth = MIN(1024, ne00);
1390
1402
 
1391
- const int n_past = ((int32_t *) dst->op_params)[0];
1392
- const int n_dims = ((int32_t *) dst->op_params)[1];
1393
- const int mode = ((int32_t *) dst->op_params)[2];
1403
+ const int n_past = ((int32_t *) dst->op_params)[0];
1404
+ const int n_dims = ((int32_t *) dst->op_params)[1];
1405
+ const int mode = ((int32_t *) dst->op_params)[2];
1406
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
1394
1407
 
1395
- float freq_base;
1396
- float freq_scale;
1397
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
1398
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
1408
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1409
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1410
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
1411
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
1412
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
1413
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1414
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1399
1415
 
1400
1416
  switch (src0->type) {
1401
1417
  case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
@@ -1403,30 +1419,35 @@ void ggml_metal_graph_compute(
1403
1419
  default: GGML_ASSERT(false);
1404
1420
  };
1405
1421
 
1406
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1407
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
1408
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
1409
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
1410
- [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
1411
- [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
1412
- [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
1413
- [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
1414
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
1415
- [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
1416
- [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
1417
- [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
1418
- [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
1419
- [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
1420
- [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
1421
- [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
1422
- [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
1423
- [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
1424
- [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
1425
- [encoder setBytes:&n_past length:sizeof( int) atIndex:19];
1426
- [encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
1427
- [encoder setBytes:&mode length:sizeof( int) atIndex:21];
1428
- [encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
1429
- [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
1422
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1423
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
1424
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
1425
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
1426
+ [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
1427
+ [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
1428
+ [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
1429
+ [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
1430
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
1431
+ [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
1432
+ [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
1433
+ [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
1434
+ [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
1435
+ [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
1436
+ [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
1437
+ [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
1438
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
1439
+ [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
1440
+ [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
1441
+ [encoder setBytes:&n_past length:sizeof( int) atIndex:19];
1442
+ [encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
1443
+ [encoder setBytes:&mode length:sizeof( int) atIndex:21];
1444
+ [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
1445
+ [encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
1446
+ [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
1447
+ [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
1448
+ [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
1449
+ [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
1450
+ [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
1430
1451
 
1431
1452
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
1432
1453
  } break;