llama_cpp 0.15.3 → 0.15.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +4 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +27 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +65 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +69 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +338 -160
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +2 -0
- data/vendor/tmp/llama.cpp/ggml.c +145 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -3
- data/vendor/tmp/llama.cpp/llama.cpp +637 -249
- data/vendor/tmp/llama.cpp/llama.h +11 -5
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
CHANGED
@@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -60,6 +60,9 @@
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
 
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
 
 typedef HANDLE pthread_t;
 
@@ -1567,11 +1576,11 @@ do { \
 
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 
-#define GGML_F32Cx8
+#define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
-static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
@@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
 
     return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    float arr[8];
 
     __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
 }
 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
     return __lsx_vld(tmp, 0);
 }
 
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     __lsx_vst(y, arr, 0);
@@ -2306,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
   const __m512 r = _mm512_set1_ps(0x1.8p23f);
   const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
   const __m512 n = _mm512_sub_ps(z, r);
-  const __m512 b =
-
-
-  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
-  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
-                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
-  if (_mm512_kortestz(c, c))
-    return _mm512_fmadd_ps(j, k, k);
-  const __m512i g = _mm512_and_si512(
-      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
-      _mm512_set1_epi32(0x82000000u));
-  const __m512 s1 =
-      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
-  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
   const __mmask16 d =
       _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-
-
-
-
-
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
 }
 
 // computes silu x/(1+exp(-x)) in single precision vector
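For readers following the ggml_v_expf rewrite above: the new code reduces x to b = x - n*ln2 (with n = round(x*log2 e)), evaluates a degree-5 polynomial in b, scales the result by 2^n via _mm512_scalef_ps, and only falls back to a 0/INFINITY blend when |n| > 192. Below is a scalar C sketch of the same scheme for reference; expf_scalar_sketch is a hypothetical helper name and not part of llama.cpp.

```c
#include <math.h>

// Scalar sketch of the rewritten ggml_v_expf scheme (illustrative only).
static inline float expf_scalar_sketch(float x) {
    const float n = rintf(x * 0x1.715476p+0f);          // round(x / ln 2)
    if (fabsf(n) > 192.0f) {                            // same cutoff as the mask `d`
        return n > 0.0f ? INFINITY : 0.0f;              // overflow -> inf, underflow -> 0
    }
    // reduced argument b = x - n*ln2, with ln2 split into hi/lo parts
    const float b = x - n*0x1.62e4p-1f - n*0x1.7f7d1cp-20f;
    const float u = b*b;
    // degree-5 polynomial approximating e^b on the reduced range
    const float j = fmaf(fmaf(fmaf(0x1.0e4020p-7f, b, 0x1.573e2ep-5f), u,
                              fmaf(0x1.555e66p-3f, b, 0x1.fffdb6p-2f)),
                         u,
                         fmaf(0x1.ffffecp-1f, b, 1.0f));
    return ldexpf(j, (int) n);                          // j * 2^n, mirroring _mm512_scalef_ps
}
```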
@@ -2883,24 +2888,20 @@ struct ggml_state {
 
 // global state
 static struct ggml_state g_state;
-static
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
 
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-
-
-
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }
 
 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-
+    atomic_flag_clear(&g_state_critical);
 }
 
 #if defined(__gnu_linux__)
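The hunk above replaces the old counter-based barrier with an atomic_flag spin lock; the MSVC shim earlier in this diff (atomic_flag_test_and_set / atomic_flag_clear via InterlockedExchange) exists so the same pattern compiles on Windows. A minimal, self-contained C11 sketch of that pattern, assuming a POSIX sched_yield():

```c
#include <stdatomic.h>   // C11 atomic_flag
#include <sched.h>       // sched_yield (POSIX)

static atomic_flag g_lock = ATOMIC_FLAG_INIT;

// Acquire: keep trying to set the flag; whoever finds it clear owns the lock.
static void critical_section_start(void) {
    while (atomic_flag_test_and_set(&g_lock)) {
        sched_yield();   // back off so the current holder can finish
    }
}

// Release: clearing the flag lets exactly one spinning thread proceed next.
static void critical_section_end(void) {
    atomic_flag_clear(&g_lock);
}
```

The flag-based lock is simpler than the previous fetch_add/fetch_sub barrier: acquisition is a single test-and-set and release is a single clear.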
@@ -3216,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -3225,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -4882,10 +4895,21 @@ struct ggml_tensor * ggml_repeat_back(
 // ggml_concat
 
 struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b
-
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int dim) {
+    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d == dim) {
+            ne[d] = a->ne[d] + b->ne[d];
+            continue;
+        }
+        GGML_ASSERT(a->ne[d] == b->ne[d]);
+        ne[d] = a->ne[d];
+    }
 
     bool is_node = false;
 
@@ -4893,7 +4917,9 @@ struct ggml_tensor * ggml_concat(
         is_node = true;
     }
 
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    ggml_set_op_params_i32(result, 0, dim);
 
     result->op = GGML_OP_CONCAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
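With the change above, ggml_concat now takes an explicit concatenation dimension and asserts that all other dimensions match. A minimal usage sketch follows; the shapes, values, and memory sizes are illustrative choices, not taken from the release.

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // ne = {4, 3, 1, 1} and {4, 5, 1, 1}: only dim 1 differs
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 5);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    // concatenate along dim 1 -> result ne = {4, 8, 1, 1}
    struct ggml_tensor * c = ggml_concat(ctx, a, b, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);
    return 0;
}
```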
@@ -5013,6 +5039,7 @@ struct ggml_tensor * ggml_leaky_relu(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
     ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
 
     result->op = GGML_OP_LEAKY_RELU;
@@ -6378,6 +6405,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        float                 base,
+        bool                  down) {
+    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -10967,26 +11004,29 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb00 == sizeof(float));
     GGML_ASSERT(nb10 == sizeof(float));
 
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
     for (int i3 = 0; i3 < ne3; i3++) {
         for (int i2 = ith; i2 < ne2; i2 += nth) {
-
-            for (int
-
-
-
-
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
                     }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
                 }
             }
         }
@@ -10994,8 +11034,8 @@ static void ggml_compute_forward_concat_f32(
 }
 
 static void ggml_compute_forward_concat(
-    const struct ggml_compute_params* params,
-    struct ggml_tensor* dst) {
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
@@ -11388,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11451,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11514,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11626,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_are_same_shape(src0, grad));
 
@@ -14326,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14375,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
                 const float cos_block_theta = cosf(block_theta);
                 const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                theta_base
+                theta_base *= theta_scale;
                 block_theta *= theta_scale;
 
                 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14410,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                 }
             } else {
-                //
-                // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                theta_base *= freq_scale;
+                // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                 for (int64_t ic = 0; ic < ne0; ic += 2) {
                     if (ic < n_dims) {
-                        const int64_t
+                        const int64_t i0 = ic/2;
 
-
-                        float cur_rot = inv_ndims * ic - ib;
-                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                        const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base/freq_factor, freq_scale, corr_dims,
+                            theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
-                        sin_theta *= sin_sign;
 
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
 
-                        const int64_t i0 = ib*n_dims + ic/2;
-
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -14511,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14560,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
                 const float cos_block_theta = cosf(block_theta);
                 const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                theta_base
+                theta_base *= theta_scale;
                 block_theta *= theta_scale;
 
                 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14591,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                 }
             } else {
-                //
-                // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                theta_base *= freq_scale;
+                // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                 for (int64_t ic = 0; ic < ne0; ic += 2) {
                     if (ic < n_dims) {
-                        const int64_t
+                        const int64_t i0 = ic/2;
 
-
-                        float cur_rot = inv_ndims * ic - ib;
-                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                        const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                        float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base/freq_factor, freq_scale, corr_dims,
+                            theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
-                        sin_theta *= sin_sign;
 
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
 
-                        const int64_t i0 = ib*n_dims + ic/2;
-
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -22742,6 +22768,16 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_FEATURE_SVE)
+    // TODO: Currently, SVE 256 bit is only supported.
+    GGML_ASSERT(svcntb() == QK8_0);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_arm_fma(void) {
 #if defined(__ARM_FEATURE_FMA)
     return 1;
@@ -22830,6 +22866,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -756,7 +756,6 @@ extern "C" {
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_empty    (const struct ggml_tensor * tensor);
     GGML_API           bool ggml_is_scalar   (const struct ggml_tensor * tensor);
@@ -765,6 +764,11 @@ extern "C" {
     GGML_API           bool ggml_is_3d       (const struct ggml_tensor * tensor);
     GGML_API           int  ggml_n_dims      (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1007,12 +1011,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // concat a and b
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
@@ -1547,6 +1552,14 @@ extern "C" {
                         float                 beta_slow),
         "use ggml_rope_ext_inplace instead");
 
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // compute correction dims for YaRN RoPE scaling
     GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
@@ -2404,6 +2417,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_sve        (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
     GGML_API int ggml_cpu_has_metal      (void);
     GGML_API int ggml_cpu_has_f16c       (void);
@@ -2418,6 +2432,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_sycl      (void);
+    GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
 
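The new ggml_cpu_has_sve() and ggml_cpu_has_rpc() entries follow the existing compile-time feature-report pattern. A small check program (illustrative only) could print them alongside the older flags:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Each ggml_cpu_has_* reports whether the feature was compiled in (1) or not (0).
    printf("NEON: %d\n", ggml_cpu_has_neon());
    printf("SVE:  %d\n", ggml_cpu_has_sve());   // new in this release
    printf("RPC:  %d\n", ggml_cpu_has_rpc());   // new in this release
    return 0;
}
```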