llama_cpp 0.15.3 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +4 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +27 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +65 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +69 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +338 -160
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +2 -0
- data/vendor/tmp/llama.cpp/ggml.c +145 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -3
- data/vendor/tmp/llama.cpp/llama.cpp +637 -249
- data/vendor/tmp/llama.cpp/llama.h +11 -5
- metadata +2 -2
 
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
CHANGED

@@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif

data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -60,6 +60,9 @@
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
 
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);

@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
 
 typedef HANDLE pthread_t;
 
@@ -1567,11 +1576,11 @@ do {
 
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 
-#define GGML_F32Cx8
+#define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
-static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {

@@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
 
     return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
     float arr[8];
 
     __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
 }
 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

@@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);

@@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
     return __lsx_vld(tmp, 0);
 }
 
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     __lsx_vst(y, arr, 0);
@@ -2306,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
   const __m512 r = _mm512_set1_ps(0x1.8p23f);
   const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
   const __m512 n = _mm512_sub_ps(z, r);
-  const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
-                                    _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
-  const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
-  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
-  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
-                                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
-  if (_mm512_kortestz(c, c))
-    return _mm512_fmadd_ps(j, k, k);
-  const __m512i g = _mm512_and_si512(
-      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
-      _mm512_set1_epi32(0x82000000u));
-  const __m512 s1 =
-      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
-  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
   const __mmask16 d =
       _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-  return _mm512_mask_blend_ps(
-      d, _mm512_mask_blend_ps(
-             c, _mm512_fmadd_ps(k, j, k),
-             _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
-      _mm512_mul_ps(s1, s1));
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
 }
 
 // computes silu x/(1+exp(-x)) in single precision vector
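The rewritten kernel above drops the hand-rolled exponent reconstruction (the e/k/g/s1/s2 bit tricks and the 126-threshold fast path) in favor of _mm512_scalef_ps, which applies the final 2^n scaling in one instruction; only lanes with |n| > 192 still need the blend to 0 (n negative) or INFINITY (n positive). A scalar model of the same scheme, with hypothetical names and written only to illustrate the math (not part of the diff):

    #include <math.h>

    /* exp(x) ~= 2^n * p(b):  n = round(x / ln 2), b = x - n*ln 2 (ln 2 split
     * into hi/lo parts for accuracy), p = degree-5 polynomial in b. The
     * constants are the same ones visible in the vector code above. */
    static float v_expf_scalar_model(float x) {
        const float n = rintf(x * 0x1.715476p+0f);              /* x * log2(e), rounded */
        const float b = x - n*0x1.62e4p-1f - n*0x1.7f7d1cp-20f; /* two-part reduction   */
        const float u = b*b;
        const float j = fmaf(fmaf(fmaf(0x1.0e4020p-7f, b, 0x1.573e2ep-5f), u,
                                  fmaf(0x1.555e66p-3f, b, 0x1.fffdb6p-2f)),
                             u, fmaf(0x1.ffffecp-1f, b, 1.0f));
        if (fabsf(n) > 192.0f) {
            return n <= 0.0f ? 0.0f : INFINITY;  /* same saturation as the mask path */
        }
        return ldexpf(j, (int) n);               /* 2^n scaling, like _mm512_scalef_ps */
    }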
@@ -2883,24 +2888,20 @@ struct ggml_state {
 
 // global state
 static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
 
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }
 
 // TODO: make this somehow automatically executed
 //       some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
+    atomic_flag_clear(&g_state_critical);
 }
 
 #if defined(__gnu_linux__)
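The counter-based spin barrier (the removed g_state_barrier add/sub dance) becomes a plain test-and-set lock on the atomic_flag shimmed in earlier for Windows. A minimal self-contained C11 sketch of the same pattern, assuming a POSIX sched_yield:

    #include <stdatomic.h>
    #include <sched.h>

    static atomic_flag lock = ATOMIC_FLAG_INIT;  /* starts cleared */

    static void critical_start(void) {
        /* test_and_set returns the previous value: true means another
         * thread holds the lock, so yield and retry until it reads false. */
        while (atomic_flag_test_and_set(&lock)) {
            sched_yield();
        }
    }

    static void critical_end(void) {
        atomic_flag_clear(&lock);                /* release the lock */
    }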
@@ -3216,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -3225,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
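The numeric suffix says how many leading dimensions may carry padded strides. Restating the two non-trivial predicates from the diff as standalone functions over the ne/nb arrays and the element size ts (illustrative only):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* nb[] are byte strides, ne[] element counts, ts the element byte size.
     * contiguous_1 leaves nb[1] unconstrained (row padding allowed);
     * contiguous_2 also leaves nb[2] unconstrained (plane padding allowed). */
    static bool contiguous_1(const int64_t ne[4], const size_t nb[4], size_t ts) {
        return nb[0] == ts && nb[2] == nb[1]*(size_t)ne[1] && nb[3] == nb[2]*(size_t)ne[2];
    }

    static bool contiguous_2(const int64_t ne[4], const size_t nb[4], size_t ts) {
        return nb[0] == ts && nb[3] == nb[2]*(size_t)ne[2];
    }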
@@ -4882,10 +4895,21 @@ struct ggml_tensor * ggml_repeat_back(
 // ggml_concat
 
 struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b) {
-    GGML_ASSERT(a->ne[2] == b->ne[2] && a->ne[3] == b->ne[3]);
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int dim) {
+    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d == dim) {
+            ne[d] = a->ne[d] + b->ne[d];
+            continue;
+        }
+        GGML_ASSERT(a->ne[d] == b->ne[d]);
+        ne[d] = a->ne[d];
+    }
 
     bool is_node = false;
 
@@ -4893,7 +4917,9 @@ struct ggml_tensor * ggml_concat(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    ggml_set_op_params_i32(result, 0, dim);
 
     result->op = GGML_OP_CONCAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
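Callers now choose the concatenation axis, which is stored in the op params for the compute kernels; every other dimension of a and b must still match, per the per-dimension assert above. A hypothetical call site (ctx, a and b assumed to be compatible tensors):

    // old behaviour, which always concatenated along ne[2]:
    struct ggml_tensor * chans = ggml_concat(ctx, a, b, 2);
    // newly possible, e.g. stacking along the row dimension:
    struct ggml_tensor * rows  = ggml_concat(ctx, a, b, 1);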
@@ -5013,6 +5039,7 @@ struct ggml_tensor * ggml_leaky_relu(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
     ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
 
     result->op   = GGML_OP_LEAKY_RELU;
@@ -6378,6 +6405,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        float                 base,
+        bool                  down) {
+    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -10967,26 +11004,29 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb00 == sizeof(float));
     GGML_ASSERT(nb10 == sizeof(float));
 
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
     for (int i3 = 0; i3 < ne3; i3++) {
         for (int i2 = ith; i2 < ne2; i2 += nth) {
-            if (i2 < ne02) { // src0
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
                     }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
                 }
             }
         }
@@ -10994,8 +11034,8 @@ static void ggml_compute_forward_concat_f32(
 }
 
 static void ggml_compute_forward_concat(
-    const struct ggml_compute_params* params,
-    struct ggml_tensor* dst) {
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
@@ -11388,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {

@@ -11451,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {

@@ -11514,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {

@@ -11626,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_are_same_shape(src0, grad));
 
@@ -14326,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 

@@ -14375,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
                     const float cos_block_theta = cosf(block_theta);
                     const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                    theta_base *= theta_scale;
+                    theta_base  *= theta_scale;
                     block_theta *= theta_scale;
 
                     const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

@@ -14410,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                     }
                 } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
+                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                     for (int64_t ic = 0; ic < ne0; ic += 2) {
                         if (ic < n_dims) {
-                            const int64_t ib = 0;
+                            const int64_t i0 = ic/2;
 
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-                            float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
-                            sin_theta *= sin_sign;
 
+                            sin_theta  *= sin_sign;
                             theta_base *= theta_scale;
 
-                            const int64_t i0 = ib*n_dims + ic/2;
-
                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                                   float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
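This refactor (repeated for the f16 path below) is index bookkeeping rather than a math change: with ib always 0, the old i0 = ib*n_dims + ic/2 collapses to i0 = ic/2, rope_yarn now receives the dimension index ic directly instead of the derived cur_rot, and the separate theta_base *= freq_scale pre-scaling is dropped (rope_yarn already takes freq_scale as an argument). The per-pair angle progression is unchanged: theta_base is multiplied by theta_scale = freq_base^(-2/n_dims) once per pair, so

    theta(i0) = p * freq_base^(-2*i0/n_dims) / freq_factor[i0]

with p the token position read from src1.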
@@ -14511,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 

@@ -14560,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
                     const float cos_block_theta = cosf(block_theta);
                     const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                    theta_base *= theta_scale;
+                    theta_base  *= theta_scale;
                     block_theta *= theta_scale;
 
                     const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

@@ -14591,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
                 } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
+                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                     for (int64_t ic = 0; ic < ne0; ic += 2) {
                         if (ic < n_dims) {
-                            const int64_t ib = 0;
+                            const int64_t i0 = ic/2;
 
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-                            float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
-                            sin_theta *= sin_sign;
 
+                            sin_theta  *= sin_sign;
                             theta_base *= theta_scale;
 
-                            const int64_t i0 = ib*n_dims + ic/2;
-
                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                                   ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
@@ -22742,6 +22768,16 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_FEATURE_SVE)
+    // TODO: Currently, SVE 256 bit is only supported.
+    GGML_ASSERT(svcntb() == QK8_0);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_arm_fma(void) {
 #if defined(__ARM_FEATURE_FMA)
     return 1;
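For reference: svcntb() reports the runtime SVE vector length in bytes, so the assertion pins support to 256-bit vectors, which line up one-to-one with the q8_0 quantization block the SVE kernels target:

    /* Illustrative arithmetic (not from the diff):
     *   svcntb() == 32 bytes -> 32 * 8 = 256-bit SVE registers
     *   QK8_0    == 32       -> one q8_0 block (32 int8 quants) per register
     */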
@@ -22830,6 +22866,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
data/vendor/tmp/llama.cpp/ggml.h CHANGED
@@ -756,7 +756,6 @@ extern "C" {
 756  756 |     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 757  757 | 
 758  758 |     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
 759      | -    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
 760  759 |     GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
 761  760 |     GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
 762  761 |     GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
@@ -765,6 +764,11 @@ extern "C" {
 765  764 |     GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
 766  765 |     GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 767  766 | 
      767 | +    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
      768 | +    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
      769 | +    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
      770 | +    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
      771 | +
 768  772 |     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 769  773 |     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 770  774 | 
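A hedged sketch of what the relaxed predicates accept (the shapes are made up for illustration): a view whose innermost dimension stays densely packed, but whose row stride still spans the parent tensor, fails ggml_is_contiguous() while passing ggml_is_contiguous_1():

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // fully packed 8x4 f32 tensor
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);

        // view of the first 4 elements of each row: dim 0 is dense,
        // but nb[1] still strides over all 8 parent columns
        struct ggml_tensor * v = ggml_view_2d(ctx, t, 4, 4, t->nb[1], 0);

        printf("t: contiguous=%d\n", ggml_is_contiguous(t));    // 1
        printf("v: contiguous=%d contiguous_1=%d\n",
               ggml_is_contiguous(v), ggml_is_contiguous_1(v)); // expected 0 1

        ggml_free(ctx);
        return 0;
    }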
@@ -1007,12 +1011,13 @@ extern "C" {
1007 1011 |             struct ggml_tensor  * a,
1008 1012 |             struct ggml_tensor  * b);
1009 1013 | 
1010      | -    // concat a and b
     1014 | +    // concat a and b along dim
1011 1015 |     // used in stable-diffusion
1012 1016 |     GGML_API struct ggml_tensor * ggml_concat(
1013 1017 |             struct ggml_context * ctx,
1014 1018 |             struct ggml_tensor  * a,
1015      | -            struct ggml_tensor  * b);
     1019 | +            struct ggml_tensor  * b,
     1020 | +            int                   dim);
1016 1021 | 
1017 1022 |     GGML_API struct ggml_tensor * ggml_abs(
1018 1023 |             struct ggml_context * ctx,
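Before this change ggml_concat always joined its inputs along dim 2; the new argument makes the axis a caller choice. A small fragment, assuming an already initialized ggml_context * ctx:

    // 16x8x4 and 16x8x6 concatenated along dim 2 -> 16x8x10
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 8, 4);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 8, 6);
    struct ggml_tensor * c = ggml_concat(ctx, a, b, 2); // old behavior, axis now explicit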
@@ -1547,6 +1552,14 @@ extern "C" {
1547 1552 |             float                 beta_slow),
1548 1553 |         "use ggml_rope_ext_inplace instead");
1549 1554 | 
     1555 | +    struct ggml_tensor * ggml_rope_xpos_inplace(
     1556 | +            struct ggml_context * ctx,
     1557 | +            struct ggml_tensor  * a,
     1558 | +            struct ggml_tensor  * b,
     1559 | +            int                   n_dims,
     1560 | +            float                 base,
     1561 | +            bool                  down);
     1562 | +
1550 1563 |     // compute correction dims for YaRN RoPE scaling
1551 1564 |     GGML_CALL void ggml_rope_yarn_corr_dims(
1552 1565 |         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
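The xPos rotary variant becomes visible in the public header here. By analogy with the other ggml_rope_* entry points, b should hold one int32 position per row of a; the exact meanings of base and down are inferred from the signature rather than documented, so treat this fragment as a sketch only (cur, n_tokens and n_rot are assumed to exist in the surrounding code):

    // per-token positions, as with the other rope variants
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    // apply the xPos rotary embedding in place; 10000.0f is the usual theta base
    cur = ggml_rope_xpos_inplace(ctx, cur, pos, n_rot, 10000.0f, /*down=*/false);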
@@ -2404,6 +2417,7 @@ extern "C" {
2404 2417 |     GGML_API int ggml_cpu_has_avx512_bf16(void);
2405 2418 |     GGML_API int ggml_cpu_has_fma        (void);
2406 2419 |     GGML_API int ggml_cpu_has_neon       (void);
     2420 | +    GGML_API int ggml_cpu_has_sve        (void);
2407 2421 |     GGML_API int ggml_cpu_has_arm_fma    (void);
2408 2422 |     GGML_API int ggml_cpu_has_metal      (void);
2409 2423 |     GGML_API int ggml_cpu_has_f16c       (void);
@@ -2418,6 +2432,7 @@ extern "C" {
2418 2432 |     GGML_API int ggml_cpu_has_sse3       (void);
2419 2433 |     GGML_API int ggml_cpu_has_ssse3      (void);
2420 2434 |     GGML_API int ggml_cpu_has_sycl       (void);
     2435 | +    GGML_API int ggml_cpu_has_rpc        (void);
2421 2436 |     GGML_API int ggml_cpu_has_vsx        (void);
2422 2437 |     GGML_API int ggml_cpu_has_matmul_int8(void);
2423 2438 | 
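Together with the ggml.c additions above, these declarations round out the capability-probe surface. A minimal report exercising the two new probes next to a couple of existing ones:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        // compile-time capabilities surfaced at runtime
        printf("NEON : %d\n", ggml_cpu_has_neon());
        printf("SVE  : %d\n", ggml_cpu_has_sve()); // new in this version
        printf("SYCL : %d\n", ggml_cpu_has_sycl());
        printf("RPC  : %d\n", ggml_cpu_has_rpc()); // new in this version
        return 0;
    }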