llama_cpp 0.15.3 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +4 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +27 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +65 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +69 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +338 -160
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +2 -0
- data/vendor/tmp/llama.cpp/ggml.c +145 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -3
- data/vendor/tmp/llama.cpp/llama.cpp +637 -249
- data/vendor/tmp/llama.cpp/llama.h +11 -5
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
CHANGED
```diff
@@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
```
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
```diff
@@ -60,6 +60,9 @@
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
 
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
```
```diff
@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
 
 typedef HANDLE pthread_t;
 
```
```diff
@@ -1567,11 +1576,11 @@ do { \
 
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 
-#define GGML_F32Cx8
+#define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
-static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
@@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
 
     return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
     float arr[8];
 
     __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
 
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
 
     return __lsx_vld(tmp, 0);
 }
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     __lsx_vst(y, arr, 0);
```
```diff
@@ -2306,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
   const __m512 r = _mm512_set1_ps(0x1.8p23f);
   const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
   const __m512 n = _mm512_sub_ps(z, r);
-  const __m512 b =
-
-
-  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
-  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
-                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
-  if (_mm512_kortestz(c, c))
-    return _mm512_fmadd_ps(j, k, k);
-  const __m512i g = _mm512_and_si512(
-      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
-      _mm512_set1_epi32(0x82000000u));
-  const __m512 s1 =
-      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
-  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
   const __mmask16 d =
       _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-
-
-
-
-
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
 }
 
 // computes silu x/(1+exp(-x)) in single precision vector
```
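The rewritten `ggml_v_expf` now applies the power-of-two scaling with `_mm512_scalef_ps` and only blends in `INFINITY`/`0` when `|n| > 192`, instead of the exponent-bit arithmetic it used before. As a reference, here is a scalar sketch of the same range-reduction idea (my own illustration, not code from the package; plain decimal constants instead of the kernel's hex-float minimax coefficients):

```c
// Scalar analogue of the decomposition used by the vector kernel:
// exp(x) = 2^n * p(r), where n = round(x / ln 2), r = x - n*ln 2,
// and p is a small polynomial. ldexpf plays the role of _mm512_scalef_ps.
#include <math.h>
#include <stdio.h>

static float expf_ref(float x) {
    const float n = rintf(x * 1.4426950408889634f); // round(x / ln 2)
    const float r = x - n * 0.6931471805599453f;    // reduced argument, |r| <= ln2/2
    // low-order polynomial approximation of exp(r)
    const float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f/6.0f + r * (1.0f/24.0f))));
    return ldexpf(p, (int) n);                      // scale by 2^n
}

int main(void) {
    for (float x = -4.0f; x <= 4.0f; x += 2.0f) {
        printf("x = %5.1f   approx = %f   libm = %f\n", x, expf_ref(x), expf(x));
    }
    return 0;
}
```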
```diff
@@ -2883,24 +2888,20 @@ struct ggml_state {
 
 // global state
 static struct ggml_state g_state;
-static
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
 
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-
-
-
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }
 
 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-
+    atomic_flag_clear(&g_state_critical);
 }
 
 #if defined(__gnu_linux__)
```
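The counter-based barrier around `g_state` is replaced by a plain test-and-set spin lock; the `atomic_flag` shim added near the top of the file provides the same primitives on Windows via `InterlockedExchange`. A minimal sketch of the pattern with standard C11 atomics (illustration only, not code from the package; single-threaded demo, whereas the real code also calls `sched_yield()` while spinning):

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag g_lock = ATOMIC_FLAG_INIT;

static void critical_section_start(void) {
    while (atomic_flag_test_and_set(&g_lock)) {
        // spin until the previous holder clears the flag
    }
}

static void critical_section_end(void) {
    atomic_flag_clear(&g_lock);
}

int main(void) {
    critical_section_start();
    puts("inside the critical section");
    critical_section_end();
    return 0;
}
```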
```diff
@@ -3216,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -3225,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
```
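`ggml_is_contiguous_2` relaxes the full contiguity check: only the innermost dimension has to be dense and `nb[3]` has to follow from `nb[2]`, so padding in dimensions 1 and 2 is tolerated. A standalone illustration against plain shape/stride arrays (values are made up; `is_contiguous_2_like` is my own mirror of the check, not a ggml function):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_contiguous_2_like(const int64_t ne[4], const int64_t nb[4], int64_t type_size) {
    // mirrors ggml_is_contiguous_2 from the hunk above:
    // dim 0 must be dense, dim 3 must follow directly from dim 2
    return nb[0] == type_size && nb[3] == nb[2] * ne[2];
}

int main(void) {
    const int64_t ne[4] = {8, 4, 2, 1};                // f32 tensor, 8 x 4 x 2 x 1
    const int64_t nb[4] = {4, 48, 48 * 4, 48 * 4 * 2}; // rows padded from 32 to 48 bytes
    printf("fully contiguous: %d\n", nb[1] == ne[0] * nb[0]);          // 0
    printf("contiguous_2    : %d\n", is_contiguous_2_like(ne, nb, 4)); // 1
    return 0;
}
```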
```diff
@@ -4882,10 +4895,21 @@ struct ggml_tensor * ggml_repeat_back(
 // ggml_concat
 
 struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b
-
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int dim) {
+    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d == dim) {
+            ne[d] = a->ne[d] + b->ne[d];
+            continue;
+        }
+        GGML_ASSERT(a->ne[d] == b->ne[d]);
+        ne[d] = a->ne[d];
+    }
 
     bool is_node = false;
 
@@ -4893,7 +4917,9 @@ struct ggml_tensor * ggml_concat(
         is_node = true;
     }
 
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    ggml_set_op_params_i32(result, 0, dim);
 
     result->op = GGML_OP_CONCAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
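With the new signature, `ggml_concat` validates and builds the output shape from an explicit `dim`: the concatenated dimension is summed and every other dimension must match (the removed kernel code further down shows the old behaviour was hard-wired to dimension 2 via `(i2 - ne02) * nb12`). A standalone mirror of that shape rule (helper and shapes are my own, not part of ggml):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DIMS 4

static void concat_shape(const int64_t a[MAX_DIMS], const int64_t b[MAX_DIMS],
                         int dim, int64_t out[MAX_DIMS]) {
    assert(dim >= 0 && dim < MAX_DIMS);
    for (int d = 0; d < MAX_DIMS; ++d) {
        if (d == dim) {
            out[d] = a[d] + b[d];   // concatenated dimension is summed
            continue;
        }
        assert(a[d] == b[d]);       // all other dimensions must match
        out[d] = a[d];
    }
}

int main(void) {
    const int64_t a[MAX_DIMS] = {64, 10, 8, 1};
    const int64_t b[MAX_DIMS] = {64,  6, 8, 1};
    int64_t out[MAX_DIMS];
    concat_shape(a, b, /*dim=*/1, out);
    printf("result: %lld x %lld x %lld x %lld\n",
           (long long) out[0], (long long) out[1], (long long) out[2], (long long) out[3]);
    return 0;
}
```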
```diff
@@ -5013,6 +5039,7 @@ struct ggml_tensor * ggml_leaky_relu(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
     ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
 
     result->op = GGML_OP_LEAKY_RELU;
```
```diff
@@ -6378,6 +6405,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int n_dims,
+        float base,
+        bool down) {
+    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
```
```diff
@@ -10967,26 +11004,29 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb00 == sizeof(float));
     GGML_ASSERT(nb10 == sizeof(float));
 
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
     for (int i3 = 0; i3 < ne3; i3++) {
         for (int i2 = ith; i2 < ne2; i2 += nth) {
-
-            for (int
-
-
-
-
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
                     }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
                 }
             }
         }
@@ -10994,8 +11034,8 @@ static void ggml_compute_forward_concat_f32(
 }
 
 static void ggml_compute_forward_concat(
-    const struct ggml_compute_params* params,
-    struct ggml_tensor* dst) {
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
```
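The rewritten kernel walks every output coordinate and picks the source on the fly: inside `src0`'s extent it reads `src0`, otherwise it reads `src1` with the coordinate shifted back by `o[dim] = src0->ne[dim]`. A small standalone trace of that mapping (shapes are made up for the example):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int     dim    = 1;
    const int64_t ne0[4] = {4, 3, 1, 1};   // src0 shape
    const int64_t ne1[4] = {4, 2, 1, 1};   // src1 shape (same except along dim)

    int64_t o[4] = {0, 0, 0, 0};
    o[dim] = ne0[dim];                     // offset applied to src1 coordinates

    for (int64_t i1 = 0; i1 < ne0[1] + ne1[1]; ++i1) {
        if (i1 < ne0[1]) {
            printf("dst[.., %lld, ..] <- src0[.., %lld, ..]\n",
                   (long long) i1, (long long) i1);
        } else {
            printf("dst[.., %lld, ..] <- src1[.., %lld, ..]\n",
                   (long long) i1, (long long) (i1 - o[1]));
        }
    }
    return 0;
}
```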
```diff
@@ -11388,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11451,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11514,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11626,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_is_contiguous_1(grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_are_same_shape(src0, grad));
 
```
```diff
@@ -14326,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14375,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
                 const float cos_block_theta = cosf(block_theta);
                 const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                theta_base
+                theta_base *= theta_scale;
                 block_theta *= theta_scale;
 
                 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14410,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                 }
             } else {
-                //
-                // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                theta_base *= freq_scale;
+                // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                 for (int64_t ic = 0; ic < ne0; ic += 2) {
                     if (ic < n_dims) {
-                        const int64_t
+                        const int64_t i0 = ic/2;
 
-
-                        float cur_rot = inv_ndims * ic - ib;
-                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                        const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base/freq_factor, freq_scale, corr_dims,
+                            theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
-                        sin_theta *= sin_sign;
 
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
 
-                        const int64_t i0 = ib*n_dims + ic/2;
-
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
```
```diff
@@ -14511,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14560,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
                 const float cos_block_theta = cosf(block_theta);
                 const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                theta_base
+                theta_base *= theta_scale;
                 block_theta *= theta_scale;
 
                 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14591,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                 }
             } else {
-                //
-                // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                theta_base *= freq_scale;
+                // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                 for (int64_t ic = 0; ic < ne0; ic += 2) {
                     if (ic < n_dims) {
-                        const int64_t
+                        const int64_t i0 = ic/2;
 
-
-                        float cur_rot = inv_ndims * ic - ib;
-                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                        const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base/freq_factor, freq_scale, corr_dims,
+                            theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
-                        sin_theta *= sin_sign;
 
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
 
-                        const int64_t i0 = ib*n_dims + ic/2;
-
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
```
```diff
@@ -22742,6 +22768,16 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_FEATURE_SVE)
+    // TODO: Currently, SVE 256 bit is only supported.
+    GGML_ASSERT(svcntb() == QK8_0);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_arm_fma(void) {
 #if defined(__ARM_FEATURE_FMA)
     return 1;
@@ -22830,6 +22866,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
```
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
```diff
@@ -756,7 +756,6 @@ extern "C" {
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
@@ -765,6 +764,11 @@ extern "C" {
     GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
     GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
```
```diff
@@ -1007,12 +1011,13 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    // concat a and b
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            struct ggml_tensor * b
+            struct ggml_tensor * b,
+            int dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
```
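Callers now have to pass the concatenation dimension explicitly. A hedged usage sketch, assuming the program is compiled and linked against the vendored ggml under `data/vendor/tmp/llama.cpp`; the buffer size and tensor shapes are placeholders, not values from the diff:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,  // placeholder scratch size
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    // 0.15.4: the dimension to concatenate along is now an explicit argument
    struct ggml_tensor * c = ggml_concat(ctx, a, b, /*dim=*/1);

    // the result shape is fixed at graph-construction time: 4 x 5
    printf("c: %lld x %lld\n", (long long) c->ne[0], (long long) c->ne[1]);

    ggml_free(ctx);
    return 0;
}
```

Under 0.15.3 the same call took no `dim` argument and, per the removed kernel code in ggml.c, always stacked along dimension 2.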
```diff
@@ -1547,6 +1552,14 @@ extern "C" {
             float beta_slow),
         "use ggml_rope_ext_inplace instead");
 
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            int n_dims,
+            float base,
+            bool down);
+
     // compute correction dims for YaRN RoPE scaling
     GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
```
```diff
@@ -2404,6 +2417,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_sve        (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
     GGML_API int ggml_cpu_has_metal      (void);
     GGML_API int ggml_cpu_has_f16c       (void);
@@ -2418,6 +2432,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_sycl       (void);
+    GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
 
```
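The two new probes are queried like the existing ones. A hedged sketch, again assuming the vendored ggml is linked in; note that when built with `__ARM_FEATURE_SVE`, `ggml_cpu_has_sve()` asserts a 256-bit vector length (`svcntb() == QK8_0`), per the TODO in the ggml.c hunk above:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // existing probe alongside the two added in this release
    printf("NEON: %d\n", ggml_cpu_has_neon());
    printf("SVE : %d\n", ggml_cpu_has_sve());
    printf("RPC : %d\n", ggml_cpu_has_rpc());
    return 0;
}
```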