alglib4 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - checksums.yaml +7 -0
 - data/README.md +47 -0
 - data/ext/alglib/alglib.cpp +537 -0
 - data/ext/alglib/alglib_array_converters.cpp +86 -0
 - data/ext/alglib/alglib_array_converters.h +15 -0
 - data/ext/alglib/alglib_utils.cpp +10 -0
 - data/ext/alglib/alglib_utils.h +6 -0
 - data/ext/alglib/alglibinternal.cpp +21749 -0
 - data/ext/alglib/alglibinternal.h +2168 -0
 - data/ext/alglib/alglibmisc.cpp +9106 -0
 - data/ext/alglib/alglibmisc.h +2114 -0
 - data/ext/alglib/ap.cpp +20094 -0
 - data/ext/alglib/ap.h +7244 -0
 - data/ext/alglib/dataanalysis.cpp +52588 -0
 - data/ext/alglib/dataanalysis.h +10601 -0
 - data/ext/alglib/diffequations.cpp +1342 -0
 - data/ext/alglib/diffequations.h +282 -0
 - data/ext/alglib/extconf.rb +5 -0
 - data/ext/alglib/fasttransforms.cpp +4696 -0
 - data/ext/alglib/fasttransforms.h +1018 -0
 - data/ext/alglib/integration.cpp +4249 -0
 - data/ext/alglib/integration.h +869 -0
 - data/ext/alglib/interpolation.cpp +74502 -0
 - data/ext/alglib/interpolation.h +12264 -0
 - data/ext/alglib/kernels_avx2.cpp +2171 -0
 - data/ext/alglib/kernels_avx2.h +201 -0
 - data/ext/alglib/kernels_fma.cpp +1065 -0
 - data/ext/alglib/kernels_fma.h +137 -0
 - data/ext/alglib/kernels_sse2.cpp +735 -0
 - data/ext/alglib/kernels_sse2.h +100 -0
 - data/ext/alglib/linalg.cpp +65182 -0
 - data/ext/alglib/linalg.h +9927 -0
 - data/ext/alglib/optimization.cpp +135331 -0
 - data/ext/alglib/optimization.h +19235 -0
 - data/ext/alglib/solvers.cpp +20488 -0
 - data/ext/alglib/solvers.h +4781 -0
 - data/ext/alglib/specialfunctions.cpp +10672 -0
 - data/ext/alglib/specialfunctions.h +2305 -0
 - data/ext/alglib/statistics.cpp +19791 -0
 - data/ext/alglib/statistics.h +1359 -0
 - data/ext/alglib/stdafx.h +2 -0
 - data/gpl2.txt +339 -0
 - data/gpl3.txt +674 -0
 - data/lib/alglib/version.rb +3 -0
 - data/lib/alglib.rb +4 -0
 - metadata +101 -0
 
data/ext/alglib/kernels_avx2.cpp
@@ -0,0 +1,2171 @@
/*************************************************************************
ALGLIB 4.04.0 (source code generated 2024-12-21)
Copyright (c) Sergey Bochkanov (ALGLIB project).

>>> SOURCE LICENSE >>>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (www.fsf.org); either version 2 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

A copy of the GNU General Public License is available at
http://www.fsf.org/licensing/licenses
>>> END OF LICENSE >>>
*************************************************************************/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include "stdafx.h"

//
// Must be defined before we include kernel header
//
#define _ALGLIB_IMPL_DEFINES
#define _ALGLIB_INTEGRITY_CHECKS_ONCE

#include "kernels_avx2.h"

// disable some irrelevant warnings
#if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
#pragma warning(disable:4100)
#pragma warning(disable:4127)
#pragma warning(disable:4611)
#pragma warning(disable:4702)
#pragma warning(disable:4996)
#endif

namespace alglib_impl
{


#if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_AVX2_INTRINSICS)

double rdotv_avx2(const ae_int_t n,
    /* Real    */ const double* __restrict x,
    /* Real    */ const double* __restrict y,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t fmaLen = (avx2len >> 2) << 2;
    const __m256d* __restrict pX = (const __m256d*)(x);
    const __m256d* __restrict pY = (const __m256d*)(y);
    __m256d ans;
    if(fmaLen >= 4) {
        __m256d fmaUnroll[4];
        fmaUnroll[0] = _mm256_mul_pd(pX[0], pY[0]);
        fmaUnroll[1] = _mm256_mul_pd(pX[1], pY[1]);
        fmaUnroll[2] = _mm256_mul_pd(pX[2], pY[2]);
        fmaUnroll[3] = _mm256_mul_pd(pX[3], pY[3]);
        for(i=4; i<fmaLen; i+=4) {
            fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pY[i]), fmaUnroll[0]);
            fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pY[i+1]), fmaUnroll[1]);
            fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pY[i+2]), fmaUnroll[2]);
            fmaUnroll[3] = _mm256_add_pd(_mm256_mul_pd(pX[i+3], pY[i+3]), fmaUnroll[3]);
        }
        switch(avx2len-fmaLen) {
        case 3:
            fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pY[i+2]), fmaUnroll[2]);
        case 2:
            fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pY[i+1]), fmaUnroll[1]);
        case 1:
            fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pY[i]), fmaUnroll[0]);
        }
        ans = _mm256_add_pd(
            _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
            _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
    }
    else {
        ans = _mm256_setzero_pd();
        switch(avx2len) {
        case 3:
            ans = _mm256_mul_pd(pX[2], pY[2]);
        case 2:
            ans = _mm256_add_pd(_mm256_mul_pd(pX[1], pY[1]), ans);
        case 1:
            ans = _mm256_add_pd(_mm256_mul_pd(pX[0], pY[0]), ans);
        }
    }
    const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
    const double *pComps = (const double*)&s;
    double dot = pComps[0] + pComps[1];
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1: {
        dot += x[tail]*y[tail];
        break;
    }
    case 2: {
        dot += x[tail+0]*y[tail+0];
        dot += x[tail+1]*y[tail+1];
        break;
    }
    case 3: {
        dot += x[tail+0]*y[tail+0];
        dot += x[tail+1]*y[tail+1];
        dot += x[tail+2]*y[tail+2];
        break;
    }
    }
    return dot;
}

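//
// The kernel above walks x and y in 4-double AVX2 lanes (avx2len = n>>2),
// splits the lane loop across four independent accumulators to hide add
// latency, reduces them to a single __m256d, folds its two 128-bit halves,
// and finishes the last n&3 elements in scalar code. For example, n=11
// gives avx2len=2 (elements 0..7 in two vector ops) and a 3-element scalar
// tail. A minimal scalar reference to test against (hypothetical helper,
// not part of the original file; results may differ in the last bits
// because vectorization reorders the summation):
//
//     static double rdotv_ref(const ae_int_t n, const double* x, const double* y)
//     {
//         ae_int_t i;
//         double s = 0.0;
//         for(i=0; i<n; i++)
//             s += x[i]*y[i];
//         return s;
//     }
//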
double rdotv2_avx2(const ae_int_t n,
    /* Real    */ const double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t fmaLen = (avx2len >> 2) << 2;
    const __m256d* __restrict pX = (const __m256d*)(x);
    __m256d ans;
    if(fmaLen >= 4) {
        //TODO: this can be further unrolled to 8 because AVX(2) provides 16 registers
        __m256d fmaUnroll[4];
        fmaUnroll[0] = _mm256_mul_pd(pX[0], pX[0]);
        fmaUnroll[1] = _mm256_mul_pd(pX[1], pX[1]);
        fmaUnroll[2] = _mm256_mul_pd(pX[2], pX[2]);
        fmaUnroll[3] = _mm256_mul_pd(pX[3], pX[3]);
        for(i=4; i<fmaLen; i+=4) {
            fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pX[i]), fmaUnroll[0]);
            fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pX[i+1]), fmaUnroll[1]);
            fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pX[i+2]), fmaUnroll[2]);
            fmaUnroll[3] = _mm256_add_pd(_mm256_mul_pd(pX[i+3], pX[i+3]), fmaUnroll[3]);
        }
        switch(avx2len-fmaLen) {
        case 3:
            fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pX[i+2]), fmaUnroll[2]);
        case 2:
            fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pX[i+1]), fmaUnroll[1]);
        case 1:
            fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pX[i]), fmaUnroll[0]);
        }
        ans = _mm256_add_pd(
            _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
            _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
    }
    else {
        ans = _mm256_setzero_pd();
        switch(avx2len) {
        case 3:
            ans = _mm256_mul_pd(pX[2], pX[2]);
        case 2:
            ans = _mm256_add_pd(_mm256_mul_pd(pX[1], pX[1]), ans);
        case 1:
            ans = _mm256_add_pd(_mm256_mul_pd(pX[0], pX[0]), ans);
        }
    }
    const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
    const double *pComps = (const double*)&s;
    double dot = pComps[0] + pComps[1];
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1: {
        dot += x[tail]*x[tail];
        break;
    }
    case 2: {
        dot += x[tail+0]*x[tail+0];
        dot += x[tail+1]*x[tail+1];
        break;
    }
    case 3: {
        dot += x[tail+0]*x[tail+0];
        dot += x[tail+1]*x[tail+1];
        dot += x[tail+2]*x[tail+2];
        break;
    }
    }
    return dot;
}

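//
// rdotv2_avx2 above is the same reduction with y replaced by x: it returns
// the squared Euclidean norm sum(x[i]*x[i], i=0..n-1).
//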
void rcopyv_avx2(ae_int_t n,
     /* Real    */ const double* __restrict x,
     /* Real    */ double* __restrict y,
     ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t tail = avx2len<<2;
    const __m256d* __restrict pSrc = (const __m256d*)(x);
    __m256d* __restrict pDest = (__m256d*)(y);
    for(i=0; i<avx2len; i++) {
        pDest[i] = pSrc[i];
    }
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = *(const double*)(pSrc+i);
        break;
    case 2:
        *(__m128d*)(pDest+i) = *(const __m128d*)(pSrc+i);
        break;
    case 3:
        *(__m128d*)(pDest+i) = *(const __m128d*)(pSrc+i);
        y[tail+2] = x[tail+2];
        break;
    }
}

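//
// rcopyv_avx2 moves whole 32-byte lanes and then copies the n&3 leftover
// doubles with one 16-byte and/or one 8-byte move, so every element is
// written exactly once. Dereferencing the __m256d pointers directly presumes
// 32-byte-aligned arguments; the *vx kernels further below peel leading
// elements first to reach that alignment.
//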
void rcopymulv_avx2(const ae_int_t n,
     const double v,
     /* Real    */ const double* __restrict x,
     /* Real    */ double* __restrict y,
     const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t tail = avx2len<<2;
    const __m256d* __restrict pSrc = (const __m256d*)(x);
    __m256d* __restrict pDest = (__m256d*)(y);
    const __m256d avx2v = _mm256_set1_pd(v);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_mul_pd(avx2v, pSrc[i]);
    }
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = v * (*(const double*)(pSrc+i));
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pSrc+i));
        break;
    case 3:
        *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pSrc+i));
        y[tail+2] = v*x[tail+2];
        break;
    }
}

void icopyv_avx2(const ae_int_t n, const ae_int_t* __restrict x,
                ae_int_t* __restrict y, ae_state* __restrict _state)
{
    const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
    __m256i *__restrict pDest = (__m256i*)y;
    const __m256i* __restrict pSrc = (const __m256i*)x;
    const ae_int_t nVec = even>>5;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = pSrc[i];
    }
    i = even/sizeof(ae_int_t);
    if(tail & 16) {
        *(__m128i*)(y+i) = *(const __m128i*)(x+i);
        i += 16/sizeof(ae_int_t);
    }
    if(tail & 8) {
        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
        i += 8/sizeof(ae_int_t);
    }
    if(tail & 4) {
        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
    }
}

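//
// icopyv_avx2 measures its tail in bytes rather than elements, so the same
// code works whether ae_int_t is 4 or 8 bytes wide. For example, with 8-byte
// ae_int_t and n=13: n*8=104 bytes, tail=104&31=8, even=96 (three 32-byte
// lanes), i=96/8=12, and the "tail & 8" branch copies the one remaining
// element.
//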
void bcopyv_avx2(const ae_int_t n, const ae_bool* __restrict x,
                ae_bool* __restrict y, ae_state* __restrict _state)
{
    const ae_int_t tail = n & 31;
    const ae_int_t even = n-tail;
    __m256i *__restrict pDest = (__m256i*)y;
    const __m256i* __restrict pSrc = (const __m256i*)x;
    const ae_int_t nVec = even>>5;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = pSrc[i];
    }
    i = even;
    if(tail & 16) {
        *(__m128i*)(y+i) = *(const __m128i*)(x+i);
        i += 16;
    }
    if(tail & 8) {
        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
        i += 8;
    }
    if(tail & 4) {
        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
        i += 4;
    }
    if(tail & 2) {
        *(y+i+0) = *(x+i+0);
        *(y+i+1) = *(x+i+1);
        i += 2;
    }
    if(tail & 1) {
        *(y+i) = *(x+i);
    }
}

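//
// Because ae_bool is a single byte, bcopyv_avx2 cascades through 16-, 8-,
// 4-, 2- and 1-byte moves, so any tail of 0..31 booleans is copied exactly
// once.
//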
void rsetv_avx2(const ae_int_t n,
     const double v,
     /* Real    */ double* __restrict x,
     const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    __m256d* __restrict pDest = (__m256d*)(x);
    const __m256d avx2v = _mm256_set1_pd(v);
    for(i=0; i<avx2len; i++) {
        pDest[i] = avx2v;
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = v;
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm256_extractf128_pd(avx2v, 0);
        break;
    case 3:
        *(__m128d*)(pDest+i) = _mm256_extractf128_pd(avx2v, 0);
        x[tail+2] = v;
        break;
    }
}

void rsetvx_avx2(const ae_int_t n, const double v, double* __restrict x,
    const ae_state* __restrict _state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if( n<=4 )
    {
        ae_int_t j;
        for(j=0; j<=n-1; j++)
            x[j] = v;
        return;
    }
    switch(unal)
    {
        case 0:
            rsetv_avx2(n, v, x, _state);
            return;
        case 8:
            x[2] = v;
        case 16:
            x[1] = v;
        case 24:
        {
            x[0] = v;
            const ptrdiff_t nDone = 4-(unal>>3);
            rsetv_avx2(n-nDone, v, x+nDone, _state);
            return;
        }
    }
}

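//
// rsetvx_avx2 handles arbitrarily aligned x by peeling: since x points to
// doubles, ((ptrdiff_t)x)&31 can only be 0, 8, 16 or 24, and the fallthrough
// switch writes exactly nDone = 4-(unal>>3) leading elements (unal==8 peels
// x[0..2], unal==24 just x[0]) before handing the now 32-byte-aligned
// remainder to rsetv_avx2.
//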
void isetv_avx2(const ae_int_t n, const ae_int_t v,
    ae_int_t* __restrict x, ae_state* __restrict _state)
{
    const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
    __m256i *__restrict pDest = (__m256i*)x;
    const __m256i avx2v = ((sizeof(v) == 4) ? _mm256_set1_epi32((ae_int32_t)v) : _mm256_set1_epi64x(v));
    const ae_int_t nVec = even>>5;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = avx2v;
    }
    memmove(pDest+i, &avx2v, tail);
}

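//
// isetv_avx2 finishes with a single memmove that copies the first `tail`
// bytes of the replicated 32-byte pattern; since every lane of avx2v holds
// the same value and tail is a multiple of sizeof(ae_int_t), any such prefix
// is a valid fill (and memmove with tail==0 is a no-op).
//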
void bsetv_avx2(const ae_int_t n, const ae_bool v, ae_bool* __restrict x,
    ae_state* __restrict _state)
{
    const ae_int_t tail = n & 31;
    const ae_int_t even = n-tail;
    __m256i *__restrict pDest = (__m256i*)x;
    const __m256i avx2v = _mm256_set1_epi8(v);
    const ae_int_t nVec = even>>5;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = avx2v;
    }
    /* _mm256_extracti128_si256() has a too high latency on latest processors (Skylake+) */
    memset(x+even, v, tail);
}

void rmulv_avx2(const ae_int_t n, const double v, double* __restrict x,
     const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    __m256d* __restrict pDest = (__m256d*)(x);
    const __m256d avx2v = _mm256_set1_pd(v);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_mul_pd(avx2v, pDest[i]);
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = v * (*(const double*)(pDest+i));
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pDest+i));
        break;
    case 3:
        *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pDest+i));
        x[tail+2] *= v;
        break;
    }
}

void rsqrtv_avx2(const ae_int_t n, double* __restrict x, const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t tail = avx2len<<2;
    __m256d* __restrict pDest = (__m256d*)(x);
    for(i=0; i<avx2len; i++)
        pDest[i] = _mm256_sqrt_pd(pDest[i]);
    for(i=tail; i<n; i++)
        x[i] = sqrt(x[i]);
}

void rmulvx_avx2(const ae_int_t n, const double v, double* __restrict x,
    const ae_state* __restrict _state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if( n<=4 )
    {
        ae_int_t i;
        for(i=0; i<=n-1; i++)
            x[i] *= v;
        return;
    }
    switch(unal) {
    case 0:
        rmulv_avx2(n, v, x, _state);
        return;
    case 8:
        x[2] = v*x[2];
    case 16:
        x[1] = v*x[1];
    case 24: {
        x[0] = v*x[0];
        const ptrdiff_t nDone = 4-(unal>>3);
        rmulv_avx2(n-nDone, v, x+nDone, _state);
        return;
    }
    }
}

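//
// rmulvx_avx2 applies the same peel-to-alignment scheme as rsetvx_avx2
// above: scale up to three leading elements in scalar code, then let the
// aligned rmulv_avx2 kernel finish the rest.
//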
            void raddv_avx2(const ae_int_t n,
         
     | 
| 
      
 466 
     | 
    
         
            +
                 const double alpha,
         
     | 
| 
      
 467 
     | 
    
         
            +
                 /* Real    */ const double* __restrict y,
         
     | 
| 
      
 468 
     | 
    
         
            +
                 /* Real    */ double* __restrict x,
         
     | 
| 
      
 469 
     | 
    
         
            +
                 const ae_state* __restrict _state)
         
     | 
| 
      
 470 
     | 
    
         
            +
            {
         
     | 
| 
      
 471 
     | 
    
         
            +
                ae_int_t i;
         
     | 
| 
      
 472 
     | 
    
         
            +
                
         
     | 
| 
      
 473 
     | 
    
         
            +
                const ae_int_t avx2len = n>>2;
         
     | 
| 
      
 474 
     | 
    
         
            +
                const __m256d* __restrict pSrc = (const __m256d*)(y);
         
     | 
| 
      
 475 
     | 
    
         
            +
                __m256d* __restrict pDest = (__m256d*)(x);
         
     | 
| 
      
 476 
     | 
    
         
            +
                const __m256d avx2alpha = _mm256_set1_pd(alpha);
         
     | 
| 
      
 477 
     | 
    
         
            +
                for(i=0; i<avx2len; i++) {
         
     | 
| 
      
 478 
     | 
    
         
            +
                    pDest[i] = _mm256_add_pd(_mm256_mul_pd(avx2alpha, pSrc[i]), pDest[i]);
         
     | 
| 
      
 479 
     | 
    
         
            +
                }
         
     | 
| 
      
 480 
     | 
    
         
            +
                const ae_int_t tail = avx2len<<2;
         
     | 
| 
      
 481 
     | 
    
         
            +
                switch(n-tail) {
         
     | 
| 
      
 482 
     | 
    
         
            +
                case 1:
         
     | 
| 
      
 483 
     | 
    
         
            +
                    *(double*)(pDest+i) = alpha * (*(const double*)(pSrc+i))
         
     | 
| 
      
 484 
     | 
    
         
            +
                        + (*(const double*)(pDest+i));
         
     | 
| 
      
 485 
     | 
    
         
            +
                    break;
         
     | 
| 
      
 486 
     | 
    
         
            +
                case 2:
         
     | 
| 
      
 487 
     | 
    
         
            +
                    *(__m128d*)(pDest+i) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i)),*(const __m128d*)(pDest+i));
         
     | 
| 
      
 488 
     | 
    
         
            +
                    break;
         
     | 
| 
      
 489 
     | 
    
         
            +
                case 3:
         
     | 
| 
      
 490 
     | 
    
         
            +
                    *(__m128d*)(pDest+i) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i)),*(const __m128d*)(pDest+i));
         
     | 
| 
      
 491 
     | 
    
         
            +
                    x[tail+2] += alpha*y[tail+2];
         
     | 
| 
      
 492 
     | 
    
         
            +
                    break;
         
     | 
| 
      
 493 
     | 
    
         
            +
                }
         
     | 
| 
      
 494 
     | 
    
         
            +
            }
         
     | 
| 
      
 495 
     | 
    
         
            +
             
     | 
| 
      
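/*
 * Note: a minimal scalar sketch of what raddv_avx2 computes, assuming x
 * and y are 32-byte aligned (the __m256d casts above require this):
 *
 *     for(i=0; i<n; i++)
 *         x[i] += alpha*y[i];
 *
 * The main loop handles four doubles per iteration; the switch(n-tail)
 * finishes the remaining n&3 elements with SSE2/scalar operations.
 */
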
void raddvx_avx_xaligned(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
{
    ae_int_t i;
    const ae_int_t vecLen = (n>>2)<<2;
    const __m256d avx2alpha = _mm256_set1_pd(alpha);
    __m256d* __restrict pDest = (__m256d*)x;
    for(i=0; i<vecLen; i+=4)
    {
        const ae_int_t iDest = i>>2;
        pDest[iDest] = _mm256_add_pd(_mm256_mul_pd(avx2alpha, _mm256_loadu_pd(y+i)), pDest[iDest]);
    }
    switch(n-vecLen) {
    case 1:
        x[i] += alpha*y[i];
        break;
    case 2: {
        const ae_int_t iDest = i>>2;
        *(__m128d*)(pDest+iDest) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0),_mm_loadu_pd(y+i)),*(const __m128d*)(pDest+iDest));
        break;
    }
    case 3:
    {
        const ae_int_t iDest = i>>2;
        *(__m128d*)(pDest+iDest) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0),_mm_loadu_pd(y+i)),*(const __m128d*)(pDest+iDest));
        x[i+2] += alpha*y[i+2];
        break;
    }
    }
}

void raddvx_avx2(const ae_int_t n, const double alpha, const double* __restrict y,
    double* __restrict x, ae_state *_state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if( n<=4 )
    {
        ae_int_t i;
        for(i=0; i<=n-1; i++)
            x[i] += alpha*y[i];
        return;
    }
    switch(unal)
    {
    case 0:
        raddvx_avx_xaligned(n, alpha, y, x, _state);
        return;
    case 8:
        x[2] += alpha*y[2];
    case 16:
        x[1] += alpha*y[1];
    case 24:
        {
            x[0] += alpha*y[0];
            const ptrdiff_t nDone = 4-(unal>>3);
            raddvx_avx_xaligned(n-nDone, alpha, y+nDone, x+nDone, _state);
            return;
        }
    }
}

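/*
 * Note on the dispatch pattern above: raddvx_avx2 peels at most three
 * leading elements so that x reaches a 32-byte boundary, then hands the
 * rest to raddvx_avx_xaligned.  The case 8/16/24 labels fall through
 * intentionally: unal==8 leaves three head elements to do by hand
 * (nDone=3), unal==16 leaves two, unal==24 leaves one.  Doubles are
 * assumed to sit on 8-byte boundaries, so unal is always a multiple of
 * 8 here.
 */
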
void rmergemulv_avx2(ae_int_t n,
     /* Real    */ const double* __restrict y,
     /* Real    */ double* __restrict x,
     const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc = (const __m256d*)(y);
    __m256d* __restrict pDest = (__m256d*)(x);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_mul_pd(pSrc[i], pDest[i]);
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = *(const double*)(pSrc+i)
            * (*(const double*)(pDest+i));
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_mul_pd(
            *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        break;
    case 3: {
        *(__m128d*)(pDest+i) = _mm_mul_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        ((double*)(pDest+i))[2] *= ((const double*)(pSrc+i))[2];
        break;
    }
    }
}

void rmergedivv_avx2(ae_int_t n,
     /* Real    */ const double* __restrict y,
     /* Real    */ double* __restrict x,
     const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc = (const __m256d*)(y);
    __m256d* __restrict pDest = (__m256d*)(x);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_div_pd(pDest[i], pSrc[i]);
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = (*(const double*)(pDest+i)) / (*(const double*)(pSrc+i));
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_div_pd(*(const __m128d*)(pDest+i), *(const __m128d*)(pSrc+i));
        break;
    case 3: {
        *(__m128d*)(pDest+i) = _mm_div_pd(*(const __m128d*)(pDest+i), *(const __m128d*)(pSrc+i));
        ((double*)(pDest+i))[2] /= ((const double*)(pSrc+i))[2];
        break;
    }
    }
}

void rmergemaxv_avx2(ae_int_t n,
     /* Real    */ const double* __restrict y,
     /* Real    */ double* __restrict x,
     ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc = (const __m256d*)(y);
    __m256d* __restrict pDest = (__m256d*)(x);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_max_pd(pSrc[i], pDest[i]);
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = *(const double*)(pSrc+i)>*(const double*)(pDest+i) ? *(const double*)(pSrc+i) : *(const double*)(pDest+i);
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_max_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        break;
    case 3:
    {
        double  s2 = ((const double*)(pSrc+i))[2];
        double *d2 = ((double*)(pDest+i))+2;
        *(__m128d*)(pDest+i) = _mm_max_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        *d2 = s2>*d2 ? s2 : *d2;
        break;
    }
    }
}

void rmergeminv_avx2(ae_int_t n,
     /* Real    */ const double* __restrict y,
     /* Real    */ double* __restrict x,
     ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc = (const __m256d*)(y);
    __m256d* __restrict pDest = (__m256d*)(x);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_min_pd(pSrc[i], pDest[i]);
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) = ae_minreal(*(const double*)(pSrc+i),
            *(const double*)(pDest+i), _state);
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_min_pd(
            *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        break;
    case 3: {
        double  s2 = ((const double*)(pSrc+i))[2];
        double *d2 = ((double*)(pDest+i))+2;
        *(__m128d*)(pDest+i) = _mm_min_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        *d2 = s2<*d2 ? s2 : *d2;
        break;
    }
    }
}

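/*
 * The four rmerge*v_avx2 kernels above share one shape: a 4-wide AVX2
 * loop over n>>2 packed lanes, then a switch(n-tail) for the n&3
 * leftover elements.  Their scalar semantics, as a sketch:
 *
 *     rmergemulv_avx2:  x[i] = x[i]*y[i]
 *     rmergedivv_avx2:  x[i] = x[i]/y[i]
 *     rmergemaxv_avx2:  x[i] = x[i]>y[i] ? x[i] : y[i]
 *     rmergeminv_avx2:  x[i] = x[i]<y[i] ? x[i] : y[i]
 */
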
double rmaxv_avx2(ae_int_t n, /* Real    */ const double* __restrict x, ae_state* __restrict _state)
{
    ae_int_t i;
    const ae_int_t avx2len = n>>2;
    const ae_int_t tail = avx2len<<2;
    const __m256d* __restrict pSrc = (const __m256d*)(x);
    if( n<=4 )
    {
        double result;
        if(n == 0)
            return 0.0;
        result = x[0];
        for(i=1; i<=n-1; i++)
        {
            double v = x[i];
            if( v>result )
                result = v;
        }
        return result;
    }
    __m256d curMax = pSrc[0];
    for(i=1; i<avx2len; i++) {
        curMax = _mm256_max_pd(curMax, pSrc[i]);
    }
    const __m128d sseMax = _mm_max_pd(_mm256_extractf128_pd(curMax, 0), _mm256_extractf128_pd(curMax, 1));
    const double *pComps = (const double *)&sseMax;
    double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
    const double *p_tail = (const double*)(pSrc+i);
    switch(n-tail)
    {
        case 1:
        {
            dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
            break;
        }
        case 2: {
            dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
            dMax = p_tail[1]>dMax ? p_tail[1] : dMax;
            break;
        }
        case 3: {
            dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
            dMax = p_tail[1]>dMax ? p_tail[1] : dMax;
            dMax = p_tail[2]>dMax ? p_tail[2] : dMax;
            break;
        }
    }
    return dMax;
}

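/*
 * Reduction sketch for rmaxv_avx2: the loop keeps a running 4-lane
 * maximum in curMax; _mm_max_pd over its two 128-bit halves folds four
 * lanes to two, one scalar compare folds two to one, and the final
 * switch merges the up-to-three elements that did not fill a full lane.
 */
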
double rmaxabsv_avx2(ae_int_t n, /* Real    */ const double* __restrict x, ae_state* __restrict _state)
{
    const __m256d signMask = _mm256_set1_pd(-0.); // -0. = 1 << 63
    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc = (const __m256d*)(x);
    if( n<=4 )
    {
        double result;
        ae_int_t i;
        result = 0;
        for(i=0; i<=n-1; i++)
        {
            double v = fabs(x[i]);
            if( v>result )
                result = v;
        }
        return result;
    }
    __m256d curMax = _mm256_andnot_pd(signMask, pSrc[0]); // abs
    ae_int_t i;
    for(i=1; i<avx2len; i++) {
        curMax = _mm256_max_pd(curMax, _mm256_andnot_pd(signMask, pSrc[i])); // abs
    }
    const ae_int_t tail = avx2len<<2;
    const __m128d sseMax = _mm_max_pd(_mm256_extractf128_pd(curMax, 0), _mm256_extractf128_pd(curMax, 1));
    const double *p_tail = (const double*)(pSrc+i);
    const double *pComps = (const double *)&sseMax;
    double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
    switch(n-tail)
    {
        case 1:
        {
            double a0 = fabs(p_tail[0]);
            dMax = a0>dMax ? a0 : dMax;
            break;
        }
        case 2:
        {
            double a0 = fabs(p_tail[0]);
            double a1 = fabs(p_tail[1]);
            dMax = a0>dMax ? a0 : dMax;
            dMax = a1>dMax ? a1 : dMax;
            break;
        }
        case 3:
        {
            double a0 = fabs(p_tail[0]);
            double a1 = fabs(p_tail[1]);
            double a2 = fabs(p_tail[2]);
            dMax = a0>dMax ? a0 : dMax;
            dMax = a1>dMax ? a1 : dMax;
            dMax = a2>dMax ? a2 : dMax;
            break;
        }
    }
    return dMax;
}

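/*
 * Branch-free fabs in rmaxabsv_avx2: signMask is -0.0, i.e. only bit 63
 * set in each lane, so _mm256_andnot_pd(signMask, v) clears every sign
 * bit and yields |v| lane-wise without a branch or a library call.
 */
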
static void rcopyvx_avx2_xaligned(const ae_int_t n, const double* __restrict x,
    double* __restrict y, ae_state *_state)
{
    ae_int_t i;
    const ae_int_t vecLen = (n>>2)<<2;
    const __m256d* __restrict pSrc = (const __m256d*)x;
    for(i=0; i<vecLen; i+=4) {
        const ae_int_t iSrc = i>>2;
        _mm256_storeu_pd(y+i, pSrc[iSrc]);
    }
    switch(n-vecLen) {
    case 1:
        y[i] = x[i];
        break;
    case 2: {
        const ae_int_t iSrc = i>>2;
        _mm_storeu_pd(y+i, *(const __m128d*)(pSrc+iSrc));
        break;
    }
    case 3: {
        const ae_int_t iSrc = i>>2;
        const __m256d t = pSrc[iSrc];
        _mm_storeu_pd(y+i, _mm256_extractf128_pd(t, 0));
        y[i+2] = *(((const double*)&t)+2);
        break;
    }
    }
}

void rcopyvx_avx2(const ae_int_t n, const double* __restrict x,
    double* __restrict y, ae_state *_state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if( n<=4 )
    {
        ae_int_t j;
        for(j=0; j<n; j++)
            y[j] = x[j];
        return;
    }
    switch(unal) {
    case 0:
        rcopyvx_avx2_xaligned(n, x, y, _state);
        return;
    case 8:
        y[2] = x[2];
    case 16:
        y[1] = x[1];
    case 24: {
        y[0] = x[0];
        const ptrdiff_t nDone = 4-(unal>>3);
        rcopyvx_avx2_xaligned(n-nDone, x+nDone, y+nDone, _state);
        return;
    }
    }
}

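/*
 * Note on the copy pair above: x is the aligned operand here, so the
 * xaligned worker reads pSrc[iSrc] directly and writes through
 * _mm256_storeu_pd because y may be unaligned; rcopyvx_avx2 peels up to
 * three head elements exactly as raddvx_avx2 does before delegating.
 */
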
static void icopyvx_avx2_xaligned(const ae_int_t n, const ae_int_t* __restrict x,
                ae_int_t* __restrict y, ae_state* __restrict _state)
{
    const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
    const __m256i* __restrict pSrc = (const __m256i*)x;
    const ae_int_t nVec = even>>5;
    const ae_int_t shift_by = 3-sizeof(ae_int_t)/8;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        const ae_int_t j = i<<shift_by;
        _mm256_storeu_si256((__m256i*)(y+j), pSrc[i]);
    }
    i = even/sizeof(ae_int_t);
    if(tail & 16) {
        _mm_storeu_si128((__m128i*)(y+i), *(const __m128i*)(x+i));
        i += 16/sizeof(ae_int_t);
    }
    if(tail & 8) {
        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
        i += 8/sizeof(ae_int_t);
    }
    if(tail & 4) {
        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
    }
}

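/*
 * icopyvx_avx2_xaligned counts in bytes rather than elements: tail is
 * the byte count left after the whole 32-byte stores, and the tail&16 /
 * tail&8 / tail&4 ladder drains it with one 16-, 8- and 4-byte copy.
 * This keeps the code correct whether ae_int_t is 4 or 8 bytes wide.
 */
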
void icopyvx_avx2(const ae_int_t n, const ae_int_t* __restrict x,
                ae_int_t* __restrict y, ae_state* __restrict _state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if( n<=8 )
    {
        ae_int_t j;
        for(j=0; j<=n-1; j++)
            y[j] = x[j];
        return;
    }
    if(unal == 0)
    {
        icopyvx_avx2_xaligned(n, x, y, _state);
        return;
    }
    const ae_int_t offset = 32-unal;
    memmove(y, x, offset);
    const ae_int_t nDone = offset / sizeof(ae_int_t);
    icopyvx_avx2_xaligned(n-nDone, x+nDone, y+nDone, _state);
}

 895 
     | 
    
         
            +
            void rgemv_straight_avx2(const ae_int_t m, const ae_int_t n,
         
     | 
| 
      
 896 
     | 
    
         
            +
                const double alpha, const ae_matrix* __restrict a,
         
     | 
| 
      
 897 
     | 
    
         
            +
                const double* __restrict x, double* __restrict y, ae_state* _state)
         
     | 
| 
      
 898 
     | 
    
         
            +
            {
         
     | 
| 
      
 899 
     | 
    
         
            +
                ae_int_t i;
         
     | 
| 
      
 900 
     | 
    
         
            +
                ae_int_t j;
         
     | 
| 
      
 901 
     | 
    
         
            +
                const __m256d* __restrict pX = (const __m256d*)x;
         
     | 
| 
      
 902 
     | 
    
         
            +
                const ae_int_t nVec = n >> 2;
         
     | 
| 
      
 903 
     | 
    
         
            +
                const ae_int_t nUnroll = nVec >> 3;
         
     | 
| 
      
 904 
     | 
    
         
            +
                __m256d sum = _mm256_setzero_pd();
         
     | 
| 
      
 905 
     | 
    
         
            +
                for(i=0; i<m; i++) {
         
     | 
| 
      
 906 
     | 
    
         
            +
                    const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
         
     | 
| 
      
 907 
     | 
    
         
            +
                    if(nUnroll >= 1) {
         
     | 
| 
      
 908 
     | 
    
         
            +
                        __m256d u0 = _mm256_mul_pd(pRow[0], pX[0]);
         
     | 
| 
      
 909 
     | 
    
         
            +
                        __m256d u1 = _mm256_mul_pd(pRow[1], pX[1]);
         
     | 
| 
      
 910 
     | 
    
         
            +
                        __m256d u2 = _mm256_mul_pd(pRow[2], pX[2]);
         
     | 
| 
      
 911 
     | 
    
         
            +
                        __m256d u3 = _mm256_mul_pd(pRow[3], pX[3]);
         
     | 
| 
      
 912 
     | 
    
         
            +
                        __m256d u4 = _mm256_mul_pd(pRow[4], pX[4]);
         
     | 
| 
      
 913 
     | 
    
         
            +
                        __m256d u5 = _mm256_mul_pd(pRow[5], pX[5]);
         
     | 
| 
      
 914 
     | 
    
         
            +
                        __m256d u6 = _mm256_mul_pd(pRow[6], pX[6]);
         
     | 
| 
      
 915 
     | 
    
         
            +
                        __m256d u7 = _mm256_mul_pd(pRow[7], pX[7]);
         
     | 
| 
      
 916 
     | 
    
         
            +
                        for(j=1; j<nUnroll; j++) {
         
     | 
| 
      
 917 
     | 
    
         
            +
                          const ae_int_t at = j<<3;
         
     | 
| 
      
 918 
     | 
    
         
            +
                          u0 = _mm256_add_pd(u0, _mm256_mul_pd(pRow[at], pX[at]));
         
     | 
| 
      
 919 
     | 
    
         
            +
                          u1 = _mm256_add_pd(u1, _mm256_mul_pd(pRow[at+1], pX[at+1]));
         
     | 
| 
      
 920 
     | 
    
         
            +
                          u2 = _mm256_add_pd(u2, _mm256_mul_pd(pRow[at+2], pX[at+2]));
         
     | 
| 
      
 921 
     | 
    
         
            +
                          u3 = _mm256_add_pd(u3, _mm256_mul_pd(pRow[at+3], pX[at+3]));
         
     | 
| 
      
 922 
     | 
    
         
            +
                          u4 = _mm256_add_pd(u4, _mm256_mul_pd(pRow[at+4], pX[at+4]));
         
     | 
| 
      
 923 
     | 
    
         
            +
                          u5 = _mm256_add_pd(u5, _mm256_mul_pd(pRow[at+5], pX[at+5]));
         
     | 
| 
      
 924 
     | 
    
         
            +
                          u6 = _mm256_add_pd(u6, _mm256_mul_pd(pRow[at+6], pX[at+6]));
         
     | 
| 
      
 925 
     | 
    
         
            +
                          u7 = _mm256_add_pd(u7, _mm256_mul_pd(pRow[at+7], pX[at+7]));
         
     | 
| 
      
 926 
     | 
    
         
            +
                        }
         
     | 
| 
      
 927 
     | 
    
         
            +
                        const ae_int_t at = j<<3;
         
     | 
| 
      
 928 
     | 
    
         
            +
                        switch(nVec-at) {
         
     | 
| 
      
 929 
     | 
    
         
            +
                        case 7:
         
     | 
| 
      
 930 
     | 
    
         
            +
                            u6 = _mm256_add_pd(_mm256_mul_pd(pX[at+6], pRow[at+6]), u6);
         
     | 
| 
      
 931 
     | 
    
         
            +
                        case 6:
         
     | 
| 
      
 932 
     | 
    
         
            +
                            u5 = _mm256_add_pd(_mm256_mul_pd(pX[at+5], pRow[at+5]), u5);
         
     | 
| 
      
 933 
     | 
    
         
            +
                        case 5:
         
     | 
| 
      
 934 
     | 
    
         
            +
                            u4 = _mm256_add_pd(_mm256_mul_pd(pX[at+4], pRow[at+4]), u4);
         
     | 
| 
      
 935 
     | 
    
         
            +
                        case 4:
         
     | 
| 
      
 936 
     | 
    
         
            +
                            u3 = _mm256_add_pd(_mm256_mul_pd(pX[at+3], pRow[at+3]), u3);
         
     | 
| 
      
 937 
     | 
    
         
            +
                        case 3:
         
     | 
| 
      
 938 
     | 
    
         
            +
                            u2 = _mm256_add_pd(_mm256_mul_pd(pX[at+2], pRow[at+2]), u2);
         
     | 
| 
      
 939 
     | 
    
         
            +
                        case 2:
         
     | 
| 
      
 940 
     | 
    
         
            +
                            u1 = _mm256_add_pd(_mm256_mul_pd(pX[at+1], pRow[at+1]), u1);
         
     | 
| 
      
 941 
     | 
    
         
            +
                        case 1:
         
     | 
| 
      
 942 
     | 
    
         
            +
                            u0 = _mm256_add_pd(_mm256_mul_pd(pX[at+0], pRow[at+0]), u0);
         
     | 
| 
      
 943 
     | 
    
         
            +
                        }
         
     | 
| 
      
 944 
     | 
    
         
            +
                        sum = _mm256_add_pd(
         
     | 
| 
      
 945 
     | 
    
         
            +
                            _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
         
     | 
| 
      
 946 
     | 
    
         
            +
                            _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
         
     | 
| 
      
 947 
     | 
    
         
            +
                    }
         
     | 
| 
      
 948 
     | 
    
         
            +
                    else {
         
     | 
| 
      
 949 
     | 
    
         
            +
                        switch(nVec) {
         
     | 
| 
      
 950 
     | 
    
         
            +
                        case 0:
         
     | 
| 
      
 951 
     | 
    
         
            +
                            sum = _mm256_setzero_pd();
         
     | 
| 
      
 952 
     | 
    
         
            +
                            break;
         
     | 
| 
      
 953 
     | 
    
         
            +
                        case 1:
         
     | 
| 
      
 954 
     | 
    
         
            +
                            sum = _mm256_mul_pd(pX[0], pRow[0]);
         
                break;
            case 2:
                sum = _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1]));
                break;
            case 3:
                sum = _mm256_add_pd(
                    _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
                    _mm256_mul_pd(pX[2], pRow[2]));
                break;
            case 4:
                sum = _mm256_add_pd(
                    _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
                    _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3])));
                break;
            case 5:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
                        _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
                    _mm256_mul_pd(pX[4], pRow[4]));
                break;
            case 6:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
                        _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
                    _mm256_add_pd(_mm256_mul_pd(pX[4], pRow[4]), _mm256_mul_pd(pX[5], pRow[5])));
                break;
            case 7:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
                        _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
                    _mm256_add_pd(
                        _mm256_add_pd(_mm256_mul_pd(pX[4], pRow[4]), _mm256_mul_pd(pX[5], pRow[5])),
                        _mm256_mul_pd(pX[6], pRow[6])));
                break;
            }
        }
        const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
        const double* pComps = (const double*)&t;
        double ans = pComps[0] + pComps[1];
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<n; j++) {
            ans += a->ptr.pp_double[i][j] * x[j];
        }
        y[i] += alpha*ans;
    }
}
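
/*
 * A standalone sketch (assumed helper, not part of the original source) of
 * the horizontal reduction used by the straight kernels above: the 4-lane
 * __m256d accumulator is split into two __m128d halves, the halves are
 * added, and the two remaining lanes are summed into a scalar.
 *
 *     static double hsum256_pd(__m256d v)
 *     {
 *         __m128d t = _mm_add_pd(_mm256_extractf128_pd(v, 0), _mm256_extractf128_pd(v, 1));
 *         return _mm_cvtsd_f64(_mm_add_pd(t, _mm_unpackhi_pd(t, t)));
 *     }
 */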
         
void rgemv_transposed_avx2(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a,
    const double* __restrict x, double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    __m256d* __restrict pY = (__m256d*)y;
    const ae_int_t nVec = m >> 2;

    for(i=0; i<=n-1; i++)
    {
        const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
        const double v = alpha*x[i];
        const __m256d vV = _mm256_set1_pd(v);
        for(j=0; j<nVec; j++)
        {
            pY[j] = _mm256_add_pd(_mm256_mul_pd(vV, pRow[j]), pY[j]);
        }
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<m; j++) {
            y[j] += v*a->ptr.pp_double[i][j];
        }
    }
}
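
/*
 * Reference semantics for the transposed kernel above (a plain-C sketch,
 * not part of the original source): each of the n rows of A contributes an
 * AXPY update into y, which keeps the inner loop unit-stride in memory:
 *
 *     for(i=0; i<n; i++)
 *         for(j=0; j<m; j++)
 *             y[j] += alpha*x[i]*a[i][j];
 */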
         
void rgemvx_straight_avx2_xaligned(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x,
    double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    const __m256d* __restrict pX = (const __m256d*)x;
    const ae_int_t nVec = n >> 2;
    const ae_int_t nUnroll = nVec >> 3;
    __m256d sum = _mm256_setzero_pd();
    for(i=0; i<m; i++) {
        const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
        if(nUnroll >= 1) {
            __m256d u0 = _mm256_mul_pd(ULOAD256PD(pRow[0]), pX[0]);
            __m256d u1 = _mm256_mul_pd(ULOAD256PD(pRow[1]), pX[1]);
            __m256d u2 = _mm256_mul_pd(ULOAD256PD(pRow[2]), pX[2]);
            __m256d u3 = _mm256_mul_pd(ULOAD256PD(pRow[3]), pX[3]);
            __m256d u4 = _mm256_mul_pd(ULOAD256PD(pRow[4]), pX[4]);
            __m256d u5 = _mm256_mul_pd(ULOAD256PD(pRow[5]), pX[5]);
            __m256d u6 = _mm256_mul_pd(ULOAD256PD(pRow[6]), pX[6]);
            __m256d u7 = _mm256_mul_pd(ULOAD256PD(pRow[7]), pX[7]);
            for(j=1; j<nUnroll; j++) {
                const ae_int_t at = j<<3;
                u0 = _mm256_add_pd(u0, _mm256_mul_pd(ULOAD256PD(pRow[at]), pX[at]));
                u1 = _mm256_add_pd(u1, _mm256_mul_pd(ULOAD256PD(pRow[at+1]), pX[at+1]));
                u2 = _mm256_add_pd(u2, _mm256_mul_pd(ULOAD256PD(pRow[at+2]), pX[at+2]));
                u3 = _mm256_add_pd(u3, _mm256_mul_pd(ULOAD256PD(pRow[at+3]), pX[at+3]));
                u4 = _mm256_add_pd(u4, _mm256_mul_pd(ULOAD256PD(pRow[at+4]), pX[at+4]));
                u5 = _mm256_add_pd(u5, _mm256_mul_pd(ULOAD256PD(pRow[at+5]), pX[at+5]));
                u6 = _mm256_add_pd(u6, _mm256_mul_pd(ULOAD256PD(pRow[at+6]), pX[at+6]));
                u7 = _mm256_add_pd(u7, _mm256_mul_pd(ULOAD256PD(pRow[at+7]), pX[at+7]));
            }
            const ae_int_t at = j<<3;
            switch(nVec-at) {
            case 7:
                u6 = _mm256_add_pd(_mm256_mul_pd(pX[at+6], ULOAD256PD(pRow[at+6])), u6);
            case 6:
                u5 = _mm256_add_pd(_mm256_mul_pd(pX[at+5], ULOAD256PD(pRow[at+5])), u5);
            case 5:
                u4 = _mm256_add_pd(_mm256_mul_pd(pX[at+4], ULOAD256PD(pRow[at+4])), u4);
            case 4:
                u3 = _mm256_add_pd(_mm256_mul_pd(pX[at+3], ULOAD256PD(pRow[at+3])), u3);
            case 3:
                u2 = _mm256_add_pd(_mm256_mul_pd(pX[at+2], ULOAD256PD(pRow[at+2])), u2);
            case 2:
                u1 = _mm256_add_pd(_mm256_mul_pd(pX[at+1], ULOAD256PD(pRow[at+1])), u1);
            case 1:
                u0 = _mm256_add_pd(_mm256_mul_pd(pX[at+0], ULOAD256PD(pRow[at+0])), u0);
            }
            sum = _mm256_add_pd(
                _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
                _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
        }
        else {
            switch(nVec) {
            case 0:
                sum = _mm256_setzero_pd();
                break;
            case 1:
                sum = _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0]));
                break;
            case 2:
                sum = _mm256_add_pd(
                    _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
                    _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1])));
                break;
            case 3:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
                        _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                    _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])));
                break;
            case 4:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
                        _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                    _mm256_add_pd(
                        _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
                        _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3]))));
                break;
            case 5:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
                            _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
                            _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
                    _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])));
                break;
            case 6:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
                            _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
                            _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
                    _mm256_add_pd(
                        _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])),
                        _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))));
                break;
            case 7:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
                            _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
                            _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
                    _mm256_add_pd(
                        _mm256_add_pd(
                            _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])),
                            _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))),
                        _mm256_mul_pd(pX[6], ULOAD256PD(pRow[6]))));
                break;
            }
        }
        const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
        const double* pComps = (const double*)&t;
        double ans = pComps[0] + pComps[1];
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<n; j++) {
            ans += a->ptr.pp_double[i+ia][j+ja] * x[j];
        }
        y[i] += alpha*ans;
    }
}
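
/*
 * Note on the kernel above (not part of the original source): x is assumed
 * to be 32-byte aligned while rows of A may be unaligned, hence the
 * ULOAD256PD() wrapper (ALGLIB's unaligned 256-bit load) on the row side
 * only. Eight independent accumulators u0..u7 hide the FP add latency; the
 * fall-through switch(nVec-at) mops up the 1..7 leftover vectors, and the
 * final scalar loop handles the n&3 trailing columns.
 */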
         
void rgemvx_straight_avx2(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x,
    double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    if( n<=3 ) {
        for(i=0; i<m; i++) {
            const double *p_a = a->ptr.pp_double[ia+i]+ja;
            double v = 0.0;
            for(j=0; j<n; j++) {
                v += p_a[j] * x[j];
            }
            y[i] += alpha*v;
        }
        return;
    }

    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if(unal == 0)
    {
        rgemvx_straight_avx2_xaligned(m, n, alpha, a, ia, ja, x, y, _state);
        return;
    }
    const ptrdiff_t shift = 4-(unal>>3);
    for(i=0; i<m; i++) {
        const double *p_a = a->ptr.pp_double[ia+i]+ja;
        double v = 0.0;
        for(j=0; j<shift; j++) {
            v += p_a[j] * x[j];
        }
        y[i] += alpha*v;
    }
    rgemvx_straight_avx2_xaligned(m, n-shift, alpha, a, ia, ja+shift, x+shift, y, _state);
}
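
/*
 * Alignment peeling above (explanatory sketch, not part of the original
 * source): doubles are 8 bytes, so for x misaligned by unal = (ptrdiff_t)x & 31
 * bytes (a multiple of 8 for any valid double*), consuming the first
 * shift = 4 - unal/8 elements in scalar code leaves x+shift on a 32-byte
 * boundary:
 *
 *     ((ptrdiff_t)x + 8*(4 - unal/8)) % 32
 *         == (unal + 32 - unal) % 32
 *         == 0
 */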
         
void rgemvx_transposed_avx2_yaligned(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x, double* __restrict y,
    ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    __m256d* __restrict pY = (__m256d*)y;
    const ae_int_t nVec = m >> 2;

    for(i=0; i<=n-1; i++)
    {
        const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
        const double v = alpha*x[i];
        const __m256d vV = _mm256_set1_pd(v);
        for(j=0; j<nVec; j++)
        {
            pY[j] = _mm256_add_pd(_mm256_mul_pd(vV, ULOAD256PD(pRow[j])), pY[j]);
        }
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<m; j++) {
            y[j] += v*a->ptr.pp_double[i+ia][j+ja];
        }
    }
}
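
/*
 * Sketch-level note (not part of the original source): the _yaligned variant
 * assumes the destination y is 32-byte aligned, so pY[j] is updated with
 * aligned accesses, while the row pointer goes through ULOAD256PD() because
 * a->ptr.pp_double[i+ia]+ja may land on an arbitrary 8-byte boundary.
 */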
         
void rgemvx_transposed_avx2(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x, double* __restrict y,
    ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    if( m<=3 ) {
        for(i=0; i<n; i++) {
            const double *p_a = a->ptr.pp_double[ia+i]+ja;
            const double v = alpha*x[i];
            for(j=0; j<m; j++) {
                y[j] += v*p_a[j];
            }
        }
        return;
    }

    const ptrdiff_t unal = ((ptrdiff_t)y) & 31;
    if(unal == 0)
    {
        rgemvx_transposed_avx2_yaligned(m, n, alpha, a, ia, ja, x, y, _state);
        return;
    }
    const ptrdiff_t shift = 4-(unal>>3);
    for(i=0; i<n; i++) {
        const double *p_a = a->ptr.pp_double[ia+i]+ja;
        const double v = alpha*x[i];
        for(j=0; j<shift; j++) {
            y[j] += v*p_a[j];
        }
    }
    rgemvx_transposed_avx2_yaligned(m-shift, n, alpha, a, ia, ja+shift, x, y+shift, _state);
}
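
/*
 * As in the straight-form dispatcher (explanatory note, not part of the
 * original source): here the destination y is the vectorized operand, so the
 * same 4 - unal/8 peel is applied to y rather than to x, after which the
 * _yaligned kernel processes the remaining m-shift components. Problems with
 * m<=3 never contain a full 4-wide vector and stay in scalar code.
 */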
         
/*************************************************************************
Block packing function for fast rGEMM. Loads a long WIDTH*LENGTH  submatrix
with LENGTH<=BLOCK_SIZE and WIDTH<=MICRO_SIZE into contiguous  MICRO_SIZE*
BLOCK_SIZE row-wise 'horizontal' storage (hence H in the function name).

The matrix occupies the first ROUND_LENGTH cols of the storage (with LENGTH
being rounded up to the nearest SIMD granularity). ROUND_LENGTH is returned
as the result. It is guaranteed that ROUND_LENGTH depends only  on  LENGTH,
and that it will be the same for all function calls.

Unused rows and columns in the [LENGTH,ROUND_LENGTH) range  are  filled  by
zeros; unused cols in the [ROUND_LENGTH,BLOCK_SIZE) range are ignored.

* op=0 means that the source is a WIDTH*LENGTH matrix stored with src_stride
  stride. The matrix is NOT transposed on load.
* op=1 means that the source is a LENGTH*WIDTH matrix stored with src_stride
  that is loaded with transposition
* the present version of the function supports only MICRO_SIZE=2;  the
  behavior is undefined for other micro sizes.
* the target must be properly aligned; the source can be unaligned.

Requires AVX2, does NOT check its presence.

The function is present in two versions, one  with  variable  opsrc_length
and another one with opsrc_length==block_size==32.

  -- ALGLIB routine --
     19.07.2021
     Bochkanov Sergey
*************************************************************************/
ae_int_t ablasf_packblkh_avx2(
    const double *src,
    ae_int_t src_stride,
    ae_int_t op,
    ae_int_t opsrc_length,
    ae_int_t opsrc_width,
    double   *dst,
    ae_int_t block_size,
    ae_int_t micro_size)
{
    ae_int_t i;
    
    /*
     * Write to the storage
     */
    if( op==0 )
    {
        /*
         * Copy without transposition
         */
        const ae_int_t len8=(opsrc_length>>3)<<3;
        const double *src1 = src+src_stride;
        double       *dst1 = dst+block_size;
        if( opsrc_width==2 )
        {
            /*
             * Width=2
             */
            for(i=0; i<len8; i+=8)
            {
                _mm256_store_pd(dst+i,    _mm256_loadu_pd(src+i));
                _mm256_store_pd(dst+i+4,  _mm256_loadu_pd(src+i+4));
                _mm256_store_pd(dst1+i,   _mm256_loadu_pd(src1+i));
                _mm256_store_pd(dst1+i+4, _mm256_loadu_pd(src1+i+4));
            }
            for(i=len8; i<opsrc_length; i++)
            {
                dst[i]  = src[i];
                dst1[i] = src1[i];
            }
        }
        else
        {
            /*
             * Width=1, pad by zeros
             */
            __m256d vz = _mm256_setzero_pd();
            for(i=0; i<len8; i+=8)
            {
                _mm256_store_pd(dst+i,    _mm256_loadu_pd(src+i));
                _mm256_store_pd(dst+i+4,  _mm256_loadu_pd(src+i+4));
                _mm256_store_pd(dst1+i,   vz);
                _mm256_store_pd(dst1+i+4, vz);
            }
            for(i=len8; i<opsrc_length; i++)
            {
                dst[i]  = src[i];
                dst1[i] = 0.0;
            }
        }
    }
    else
    {
        /*
         * Copy with transposition
         */
        const ae_int_t stride2 = src_stride<<1;
        const ae_int_t stride3 = src_stride+stride2;
        const ae_int_t stride4 = src_stride<<2;
        const ae_int_t len4=(opsrc_length>>2)<<2;
        const double *srci = src;
        double       *dst1 = dst+block_size;
        if( opsrc_width==2 )
        {
            /*
             * Width=2
             */
            for(i=0; i<len4; i+=4)
            {
                __m128d s0 = _mm_loadu_pd(srci), s1 = _mm_loadu_pd(srci+src_stride);
                __m128d s2 = _mm_loadu_pd(srci+stride2), s3 = _mm_loadu_pd(srci+stride3);
                _mm_store_pd(dst+i,    _mm_unpacklo_pd(s0,s1));
                _mm_store_pd(dst1+i,   _mm_unpackhi_pd(s0,s1));
                _mm_store_pd(dst+i+2,  _mm_unpacklo_pd(s2,s3));
                _mm_store_pd(dst1+i+2, _mm_unpackhi_pd(s2,s3));
                srci += stride4;
            }
            for(i=len4; i<opsrc_length; i++)
            {
                dst[i]  = srci[0];
                dst1[i] = srci[1];
                srci += src_stride;
            }
        }
        else
        {
            /*
             * Width=1, pad by zeros
             */
            __m128d vz = _mm_setzero_pd();
            for(i=0; i<len4; i+=4)
            {
                __m128d s0 = _mm_load_sd(srci), s1 = _mm_load_sd(srci+src_stride);
                __m128d s2 = _mm_load_sd(srci+stride2), s3 = _mm_load_sd(srci+stride3);
                _mm_store_pd(dst+i,    _mm_unpacklo_pd(s0,s1));
                _mm_store_pd(dst+i+2,  _mm_unpacklo_pd(s2,s3));
                _mm_store_pd(dst1+i,   vz);
                _mm_store_pd(dst1+i+2, vz);
                srci += stride4;
            }
            for(i=len4; i<opsrc_length; i++)
            {
                dst[i]  = srci[0];
                dst1[i] = 0.0;
                srci += src_stride;
            }
        }
    }
    
    /*
     * Pad by zeros, if needed
     */
    ae_int_t round_length = ((opsrc_length+3)>>2)<<2;
    for(i=opsrc_length; i<round_length; i++)
    {
        dst[i] = 0;
        dst[i+block_size] = 0;
    }
    return round_length;
}
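
/*
 * Worked example for the packer above (not part of the original source):
 * with opsrc_length=13 the returned ROUND_LENGTH is ((13+3)>>2)<<2 = 16,
 * i.e. LENGTH rounded up to the 4-double AVX granularity; columns 13..15 of
 * both micro-rows are zeroed so that the GEMM microkernel may always read
 * whole 256-bit vectors from the packed block.
 */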
         
ae_int_t ablasf_packblkh32_avx2(
    const double *src,
    ae_int_t src_stride,
    ae_int_t op,
    ae_int_t ignore_opsrc_length,
    ae_int_t opsrc_width,
    double   *dst,
    ae_int_t ignore_block_size,
    ae_int_t micro_size)
{
    ae_int_t i;
    
    /*
     * Write to the storage
     */
    if( op==0 )
    {
        /*
         * Copy without transposition
         */
        const double *src1 = src+src_stride;
        double       *dst1 = dst+32;
        if( opsrc_width==2 )
        {
            /*
             * Width=2
             */
            for(i=0; i<32; i+=8)
            {
                _mm256_store_pd(dst+i,    _mm256_loadu_pd(src+i));
                _mm256_store_pd(dst+i+4,  _mm256_loadu_pd(src+i+4));
                _mm256_store_pd(dst1+i,   _mm256_loadu_pd(src1+i));
                _mm256_store_pd(dst1+i+4, _mm256_loadu_pd(src1+i+4));
            }
        }
        else
        {
            /*
             * Width=1, pad by zeros
             */
            __m256d vz = _mm256_setzero_pd();
            for(i=0; i<32; i+=8)
            {
                _mm256_store_pd(dst+i,    _mm256_loadu_pd(src+i));
                _mm256_store_pd(dst+i+4,  _mm256_loadu_pd(src+i+4));
                _mm256_store_pd(dst1+i,   vz);
                _mm256_store_pd(dst1+i+4, vz);
            }
        }
    }
    else
    {
        /*
         * Copy with transposition
         */
        const ae_int_t stride2 = src_stride<<1;
        const ae_int_t stride3 = src_stride+stride2;
        const ae_int_t stride4 = src_stride<<2;
        const double *srci = src;
        double       *dst1 = dst+32;
        if( opsrc_width==2 )
        {
            /*
                         * Width=2
         
     | 
| 
      
 1488 
     | 
    
         
            +
                         */
         
     | 
| 
      
 1489 
     | 
    
         
            +
                        for(i=0; i<32; i+=4)
         
     | 
| 
      
 1490 
     | 
    
         
            +
                        {
         
     | 
| 
      
 1491 
     | 
    
         
            +
                            __m128d s0 = _mm_loadu_pd(srci), s1 = _mm_loadu_pd(srci+src_stride);
         
     | 
| 
      
 1492 
     | 
    
         
            +
                            __m128d s2 = _mm_loadu_pd(srci+stride2), s3 = _mm_loadu_pd(srci+stride3);
         
     | 
| 
      
 1493 
     | 
    
         
            +
                            _mm_store_pd(dst+i,    _mm_unpacklo_pd(s0,s1));
         
     | 
| 
      
 1494 
     | 
    
         
            +
                            _mm_store_pd(dst1+i,   _mm_unpackhi_pd(s0,s1));
         
     | 
| 
      
 1495 
     | 
    
         
            +
                            _mm_store_pd(dst+i+2,  _mm_unpacklo_pd(s2,s3));
         
     | 
| 
      
 1496 
     | 
    
         
            +
                            _mm_store_pd(dst1+i+2, _mm_unpackhi_pd(s2,s3));
         
     | 
| 
      
 1497 
     | 
    
         
            +
                            srci += stride4;
         
     | 
| 
      
 1498 
     | 
    
         
            +
                        }
         
     | 
| 
      
 1499 
     | 
    
         
            +
                    }
         
     | 
| 
      
 1500 
     | 
    
         
            +
                    else
         
     | 
| 
      
 1501 
     | 
    
         
            +
                    {
         
     | 
| 
      
 1502 
     | 
    
         
            +
                        /*
         
     | 
| 
      
 1503 
     | 
    
         
            +
                         * Width=1, pad by zeros
         
     | 
| 
      
 1504 
     | 
    
         
            +
                         */
         
     | 
| 
      
 1505 
     | 
    
         
            +
                        __m128d vz = _mm_setzero_pd();
         
     | 
| 
      
 1506 
     | 
    
         
            +
                        for(i=0; i<32; i+=4)
         
     | 
| 
      
 1507 
     | 
    
         
            +
                        {
         
     | 
| 
      
 1508 
     | 
    
         
            +
                            __m128d s0 = _mm_load_sd(srci), s1 = _mm_load_sd(srci+src_stride);
         
     | 
| 
      
 1509 
     | 
    
         
            +
                            __m128d s2 = _mm_load_sd(srci+stride2), s3 = _mm_load_sd(srci+stride3);
         
     | 
| 
      
 1510 
     | 
    
         
            +
                            _mm_store_pd(dst+i,    _mm_unpacklo_pd(s0,s1));
         
     | 
| 
      
 1511 
     | 
    
         
            +
                            _mm_store_pd(dst+i+2,  _mm_unpacklo_pd(s2,s3));
         
     | 
| 
      
 1512 
     | 
    
         
            +
                            _mm_store_pd(dst1+i,   vz);
         
     | 
| 
      
 1513 
     | 
    
         
            +
                            _mm_store_pd(dst1+i+2, vz);
         
     | 
| 
      
 1514 
     | 
    
         
            +
                            srci += stride4;
         
     | 
| 
      
 1515 
     | 
    
         
            +
                        }
         
     | 
| 
      
 1516 
     | 
    
         
            +
                    }
         
     | 
| 
      
 1517 
     | 
    
         
            +
                }
         
     | 
| 
      
 1518 
     | 
    
         
            +
                return 32;
         
     | 
| 
      
 1519 
     | 
    
         
            +
            }
         
     | 
| 
      
 1520 
     | 
    
         
            +
             
     | 
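
/*************************************************************************
A minimal scalar sketch (illustrative only, NOT an ALGLIB routine) of the
packing contract implemented by ablasf_packblkh32_avx2() above: DST  gets
two consecutive rows of 32 doubles, with packed row R holding operand row
R (OP==0) or operand column R (OP!=0), zero-padded when  OPSRC_WIDTH==1.
The name packblkh32_scalar_sketch is hypothetical.
*************************************************************************/
static ae_int_t packblkh32_scalar_sketch(
    const double *src,
    ae_int_t src_stride,
    ae_int_t op,
    ae_int_t opsrc_width,
    double *dst)
{
    ae_int_t i, r;
    for(r=0; r<2; r++)
        for(i=0; i<32; i++)
        {
            /* packed row r is operand row r, or operand column r when transposing */
            double v = 0.0;
            if( r<opsrc_width )
                v = op==0 ? src[r*src_stride+i] : src[i*src_stride+r];
            dst[r*32+i] = v;
        }
    return 32;   /* the SIMD-rounded length returned by the AVX2 kernel */
}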

/*************************************************************************
Computes  product   A*transpose(B)  of two MICRO_SIZE*ROUND_LENGTH rowwise
'horizontal' matrices, stored with stride=block_size, and writes it to the
row-wise matrix C.

ROUND_LENGTH is expected to be properly SIMD-rounded length,  as  returned
by ablasf_packblkh_avx2().

Present version of the function supports only MICRO_SIZE=2,  the  behavior
is undefined for other micro sizes.

Requires AVX2, does NOT check its presence.

  -- ALGLIB routine --
     19.07.2021
     Bochkanov Sergey
*************************************************************************/
void ablasf_dotblkh_avx2(
    const double *src_a,
    const double *src_b,
    ae_int_t round_length,
    ae_int_t block_size,
    ae_int_t micro_size,
    double *dst,
    ae_int_t dst_stride)
{
    ae_int_t z;
    __m256d r00 = _mm256_setzero_pd(), r01 = _mm256_setzero_pd(), r10 = _mm256_setzero_pd(), r11 = _mm256_setzero_pd();
    if( round_length&0x7 )
    {
        /*
         * round_length is multiple of 4, but not multiple of 8
         */
        for(z=0; z<round_length; z+=4, src_a+=4, src_b+=4)
        {
            __m256d a0 = _mm256_load_pd(src_a);
            __m256d a1 = _mm256_load_pd(src_a+block_size);
            __m256d b0 = _mm256_load_pd(src_b);
            __m256d b1 = _mm256_load_pd(src_b+block_size);
            r00 = _mm256_add_pd(_mm256_mul_pd(a0, b0), r00);
            r01 = _mm256_add_pd(_mm256_mul_pd(a0, b1), r01);
            r10 = _mm256_add_pd(_mm256_mul_pd(a1, b0), r10);
            r11 = _mm256_add_pd(_mm256_mul_pd(a1, b1), r11);
        }
    }
    else
    {
        /*
         * round_length is multiple of 8
         */
        for(z=0; z<round_length; z+=8, src_a+=8, src_b+=8)
        {
            __m256d a0 = _mm256_load_pd(src_a);
            __m256d a1 = _mm256_load_pd(src_a+block_size);
            __m256d b0 = _mm256_load_pd(src_b);
            __m256d b1 = _mm256_load_pd(src_b+block_size);
            __m256d c0 = _mm256_load_pd(src_a+4);
            __m256d c1 = _mm256_load_pd(src_a+block_size+4);
            __m256d d0 = _mm256_load_pd(src_b+4);
            __m256d d1 = _mm256_load_pd(src_b+block_size+4);
            r00 = _mm256_add_pd(_mm256_add_pd(r00, _mm256_mul_pd(a0, b0)), _mm256_mul_pd(c0, d0));
            r01 = _mm256_add_pd(_mm256_add_pd(r01, _mm256_mul_pd(a0, b1)), _mm256_mul_pd(c0, d1));
            r10 = _mm256_add_pd(_mm256_add_pd(r10, _mm256_mul_pd(a1, b0)), _mm256_mul_pd(c1, d0));
            r11 = _mm256_add_pd(_mm256_add_pd(r11, _mm256_mul_pd(a1, b1)), _mm256_mul_pd(c1, d1));
        }
    }
    __m256d sum0 = _mm256_hadd_pd(r00,r01);
    __m256d sum1 = _mm256_hadd_pd(r10,r11);
    _mm_store_pd(dst,            _mm_add_pd(_mm256_castpd256_pd128(sum0), _mm256_extractf128_pd(sum0,1)));
    _mm_store_pd(dst+dst_stride, _mm_add_pd(_mm256_castpd256_pd128(sum1), _mm256_extractf128_pd(sum1,1)));
}
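
/*************************************************************************
A scalar reference sketch (illustrative only, NOT an ALGLIB routine)  of
the 2x2 micro-kernel above: DST[i][j] is the dot product of row i of  A
with row j of B over the first ROUND_LENGTH entries, which is exactly the
quantity the horizontal-add tail of ablasf_dotblkh_avx2() reduces the four
AVX2 accumulators to. The name dotblkh_scalar_sketch is hypothetical.
*************************************************************************/
static void dotblkh_scalar_sketch(
    const double *src_a,
    const double *src_b,
    ae_int_t round_length,
    ae_int_t block_size,
    double *dst,
    ae_int_t dst_stride)
{
    ae_int_t i, j, z;
    for(i=0; i<2; i++)
        for(j=0; j<2; j++)
        {
            double v = 0.0;
            for(z=0; z<round_length; z++)
                v += src_a[i*block_size+z]*src_b[j*block_size+z];
            dst[i*dst_stride+j] = v;
        }
}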

/*************************************************************************
Y := alpha*X + beta*Y

Requires AVX2, does NOT check its presence.

  -- ALGLIB routine --
     19.07.2021
     Bochkanov Sergey
*************************************************************************/
void ablasf_daxpby_avx2(
    ae_int_t     n,
    double       alpha,
    const double *src,
    double       beta,
    double       *dst)
{
    if( beta==1.0 )
    {
        /*
         * The most optimized case: DST := alpha*SRC + DST
         *
         * First, we process leading elements with generic C code until DST is aligned.
         * Then, we process central part, assuming that DST is properly aligned.
         * Finally, we process tail.
         */
        ae_int_t i, n4;
        __m256d avx_alpha = _mm256_set1_pd(alpha);
        while( n>0 && (((ptrdiff_t)dst)&31) )
        {
            *dst += alpha*(*src);
            n--;
            dst++;
            src++;
        }
        n4=(n>>2)<<2;
        for(i=0; i<n4; i+=4)
        {
            __m256d r = _mm256_add_pd(_mm256_mul_pd(avx_alpha, _mm256_loadu_pd(src+i)), _mm256_load_pd(dst+i));
            _mm256_store_pd(dst+i, r);
        }
        for(i=n4; i<n; i++)
            dst[i] = alpha*src[i]+dst[i];
    }
    else if( beta!=0.0 )
    {
        /*
         * Well optimized: DST := alpha*SRC + beta*DST
         */
        ae_int_t i, n4;
        __m256d avx_alpha = _mm256_set1_pd(alpha);
        __m256d avx_beta  = _mm256_set1_pd(beta);
        while( n>0 && (((ptrdiff_t)dst)&31) )
        {
            *dst = alpha*(*src) + beta*(*dst);
            n--;
            dst++;
            src++;
        }
        n4=(n>>2)<<2;
        for(i=0; i<n4; i+=4)
        {
            __m256d r = _mm256_add_pd(
                _mm256_mul_pd(avx_alpha, _mm256_loadu_pd(src+i)),
                _mm256_mul_pd(avx_beta,_mm256_load_pd(dst+i)));
            _mm256_store_pd(dst+i, r);
        }
        for(i=n4; i<n; i++)
            dst[i] = alpha*src[i]+beta*dst[i];
    }
    else
    {
        /*
         * Easy case: DST := alpha*SRC
         */
        ae_int_t i;
        for(i=0; i<n; i++)
            dst[i] = alpha*src[i];
    }
}
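
/*************************************************************************
A scalar sketch (illustrative only, NOT an ALGLIB routine) of the semantics
of ablasf_daxpby_avx2() above. Note the special handling of BETA==0:  DST
is overwritten rather than multiplied, so stale or uninitialized values in
DST (including NaN/Inf) do not propagate into the result.  The  name
daxpby_scalar_sketch is hypothetical.
*************************************************************************/
static void daxpby_scalar_sketch(
    ae_int_t     n,
    double       alpha,
    const double *src,
    double       beta,
    double       *dst)
{
    ae_int_t i;
    if( beta!=0.0 )
        for(i=0; i<n; i++)
            dst[i] = alpha*src[i]+beta*dst[i];
    else
        for(i=0; i<n; i++)
            dst[i] = alpha*src[i];   /* overwrite: prior contents of dst are ignored */
}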

ae_bool spchol_updatekernelabc4_avx2(double* rowstorage,
     ae_int_t offss,
     ae_int_t twidth,
     ae_int_t offsu,
     ae_int_t uheight,
     ae_int_t urank,
     ae_int_t urowstride,
     ae_int_t uwidth,
     const double* diagd,
     ae_int_t offsd,
     const ae_int_t* raw2smap,
     const ae_int_t* superrowidx,
     ae_int_t urbase,
     ae_state *_state)
{
    ae_int_t k;
    ae_int_t targetrow;
    ae_int_t targetcol;
    
    /*
     * Filter out unsupported combinations (ones that are too sparse for the non-SIMD code)
     */
    if( twidth<3||twidth>4 )
    {
        return ae_false;
    }
    if( uwidth<1||uwidth>4 )
    {
        return ae_false;
    }
    if( urank>4 )
    {
        return ae_false;
    }
    
    /*
     * Shift input arrays to the beginning of the working area.
     * Prepare SIMD masks
     */
    __m256i v_rankmask = _mm256_cmpgt_epi64(_mm256_set_epi64x(urank, urank, urank, urank), _mm256_set_epi64x(3, 2, 1, 0));
    double *update_storage = rowstorage+offsu;
    double *target_storage = rowstorage+offss;
    superrowidx += urbase;
    
    /*
     * Load head of the update matrix
     */
    __m256d v_d0123 = _mm256_maskload_pd(diagd+offsd, v_rankmask);
    __m256d u_0_0123 = _mm256_setzero_pd();
    __m256d u_1_0123 = _mm256_setzero_pd();
    __m256d u_2_0123 = _mm256_setzero_pd();
    __m256d u_3_0123 = _mm256_setzero_pd();
    for(k=0; k<=uwidth-1; k++)
    {
        targetcol = raw2smap[superrowidx[k]];
        if( targetcol==0 )
            u_0_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
        if( targetcol==1 )
            u_1_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
        if( targetcol==2 )
            u_2_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
        if( targetcol==3 )
            u_3_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
    }
    
    /*
     * Transpose head
     */
    __m256d u01_lo = _mm256_unpacklo_pd(u_0_0123,u_1_0123);
    __m256d u01_hi = _mm256_unpackhi_pd(u_0_0123,u_1_0123);
    __m256d u23_lo = _mm256_unpacklo_pd(u_2_0123,u_3_0123);
    __m256d u23_hi = _mm256_unpackhi_pd(u_2_0123,u_3_0123);
    __m256d u_0123_0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
    __m256d u_0123_1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
    __m256d u_0123_2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
    __m256d u_0123_3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
    
    /*
     * Run update
     */
    if( urank==1 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
        }
    }
    if( urank==2 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
        }
    }
    if( urank==3 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+2), u_0123_2)),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
        }
    }
    if( urank==4 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+3), u_0123_3)),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+2), u_0123_2)),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
                    _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
        }
    }
    return ae_true;
}
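
/*************************************************************************
An isolated sketch (illustrative only, NOT an ALGLIB routine) of the  4x4
in-register transpose idiom used in the "Transpose head" step  above  and
again in spchol_updatekernel4444_avx2() below: two unpacklo/unpackhi pairs
interleave elements within 128-bit lanes, then _mm256_permute2f128_pd with
masks 0x20/0x13 exchanges the lanes. The name transpose4x4_pd_sketch  is
hypothetical; unaligned loads/stores are used so the sketch is safe  for
any double[16] buffer.
*************************************************************************/
static void transpose4x4_pd_sketch(const double a[16], double at[16])
{
    __m256d r0 = _mm256_loadu_pd(a+0),  r1 = _mm256_loadu_pd(a+4);
    __m256d r2 = _mm256_loadu_pd(a+8),  r3 = _mm256_loadu_pd(a+12);
    __m256d lo01 = _mm256_unpacklo_pd(r0,r1), hi01 = _mm256_unpackhi_pd(r0,r1);
    __m256d lo23 = _mm256_unpacklo_pd(r2,r3), hi23 = _mm256_unpackhi_pd(r2,r3);
    _mm256_storeu_pd(at+0,  _mm256_permute2f128_pd(lo01, lo23, 0x20));  /* column 0 */
    _mm256_storeu_pd(at+4,  _mm256_permute2f128_pd(hi01, hi23, 0x20));  /* column 1 */
    _mm256_storeu_pd(at+8,  _mm256_permute2f128_pd(lo23, lo01, 0x13));  /* column 2 */
    _mm256_storeu_pd(at+12, _mm256_permute2f128_pd(hi23, hi01, 0x13));  /* column 3 */
}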

ae_bool spchol_updatekernel4444_avx2(
     double*  rowstorage,
     ae_int_t offss,
     ae_int_t sheight,
     ae_int_t offsu,
     ae_int_t uheight,
     const double*  diagd,
     ae_int_t offsd,
     const ae_int_t* raw2smap,
     const ae_int_t* superrowidx,
     ae_int_t urbase,
     ae_state *_state)
{
    ae_int_t k;
    ae_int_t targetrow;
    ae_int_t offsk;
    __m256d v_negd_u0, v_negd_u1, v_negd_u2, v_negd_u3, v_negd;
    __m256d v_w0, v_w1, v_w2, v_w3, u01_lo, u01_hi, u23_lo, u23_hi;
    
    /*
     * Compute W = -D*transpose(U[0:3])
     */
    v_negd = _mm256_mul_pd(_mm256_loadu_pd(diagd+offsd),_mm256_set1_pd(-1.0));
    v_negd_u0   = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+0*4),v_negd);
    v_negd_u1   = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+1*4),v_negd);
    v_negd_u2   = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+2*4),v_negd);
    v_negd_u3   = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+3*4),v_negd);
    u01_lo = _mm256_unpacklo_pd(v_negd_u0,v_negd_u1);
    u01_hi = _mm256_unpackhi_pd(v_negd_u0,v_negd_u1);
    u23_lo = _mm256_unpacklo_pd(v_negd_u2,v_negd_u3);
    u23_hi = _mm256_unpackhi_pd(v_negd_u2,v_negd_u3);
    v_w0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
    v_w1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
    v_w2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
    v_w3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
    
    //
    // Compute update S := S + row_scatter(U*W)
    //
    if( sheight==uheight )
    {
        /*
         * No row scatter, the most efficient code
         */
        for(k=0; k<=uheight-1; k++)
        {
            __m256d target;
            
            targetrow = offss+k*4;
            offsk = offsu+k*4;
            
            target = _mm256_load_pd(rowstorage+targetrow);
            target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0),target);
            target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1),target);
            target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2),target);
            target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3),target);
            _mm256_store_pd(rowstorage+targetrow, target);
        }
    }
    else
    {
        /*
         * Row scatter is performed, less efficient code using double mapping to determine target row index
         */
        for(k=0; k<=uheight-1; k++)
        {
            __m256d v_uk0, v_uk1, v_uk2, v_uk3, target;
            
            targetrow = offss+raw2smap[superrowidx[urbase+k]]*4;
            offsk = offsu+k*4;
            
            target = _mm256_load_pd(rowstorage+targetrow);
            v_uk0 = _mm256_broadcast_sd(rowstorage+offsk+0);
            v_uk1 = _mm256_broadcast_sd(rowstorage+offsk+1);
            v_uk2 = _mm256_broadcast_sd(rowstorage+offsk+2);
            v_uk3 = _mm256_broadcast_sd(rowstorage+offsk+3);
            target = _mm256_add_pd(_mm256_mul_pd(v_uk0,v_w0),target);
            target = _mm256_add_pd(_mm256_mul_pd(v_uk1,v_w1),target);
            target = _mm256_add_pd(_mm256_mul_pd(v_uk2,v_w2),target);
            target = _mm256_add_pd(_mm256_mul_pd(v_uk3,v_w3),target);
            _mm256_store_pd(rowstorage+targetrow, target);
        }
    }
    return ae_true;
}
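
/*************************************************************************
A scalar sketch (illustrative only, NOT an ALGLIB routine) of the  update
performed by spchol_updatekernel4444_avx2() above: with U denoting  the
UHEIGHT x 4 update block at ROWSTORAGE+OFFSU and D the diagonal  entries
DIAGD[OFFSD..OFFSD+3], each target row receives

    S[t] := S[t] - U[k]*D*transpose(U[0:4]),

with t=k when SHEIGHT==UHEIGHT and t taken from the raw2smap/superrowidx
double mapping otherwise. The name updatekernel4444_scalar_sketch is
hypothetical.
*************************************************************************/
static void updatekernel4444_scalar_sketch(
    double *rowstorage, ae_int_t offss, ae_int_t sheight,
    ae_int_t offsu, ae_int_t uheight,
    const double *diagd, ae_int_t offsd,
    const ae_int_t *raw2smap, const ae_int_t *superrowidx, ae_int_t urbase)
{
    ae_int_t k, j, c;
    for(k=0; k<uheight; k++)
    {
        const double *u_k = rowstorage+offsu+k*4;
        double *s_t = rowstorage + (sheight==uheight
            ? offss+k*4
            : offss+raw2smap[superrowidx[urbase+k]]*4);
        for(j=0; j<4; j++)
            for(c=0; c<4; c++)
                s_t[j] -= u_k[c]*diagd[offsd+c]*rowstorage[offsu+j*4+c];
    }
}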
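
/*************************************************************************
The biharmonic far-field kernel below needs exp(i*phi), exp(2i*phi),
exp(3i*phi) and exp(4i*phi); it builds them by repeated complex
multiplication of the unit vector (d0/r01, d1/r01) with itself instead of
calling trigonometric functions. A standalone sketch of that recurrence
(illustrative only; the name expiphi_powers_sketch is hypothetical):
*************************************************************************/
static void expiphi_powers_sketch(double cosphi, double sinphi,
    double *re, double *im, ae_int_t maxpower)
{
    ae_int_t k;
    re[0] = 1.0;      /* exp(0)      */
    im[0] = 0.0;
    re[1] = cosphi;   /* exp(i*phi)  */
    im[1] = sinphi;
    for(k=2; k<=maxpower; k++)
    {
        /* exp(i*k*phi) = exp(i*(k-1)*phi)*exp(i*phi), complex multiplication */
        re[k] = re[k-1]*cosphi-im[k-1]*sinphi;
        im[k] = re[k-1]*sinphi+im[k-1]*cosphi;
    }
}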
| 
      
 1893 
     | 
    
         
            +
            /*************************************************************************
         
     | 
| 
      
 1894 
     | 
    
         
            +
            Fast kernel for biharmonic panel with NY=1
         
     | 
| 
      
 1895 
     | 
    
         
            +
             
     | 
| 
      
 1896 
     | 
    
         
            +
            INPUT PARAMETERS:
         
     | 
| 
      
 1897 
     | 
    
         
            +
                D0, D1, D2      -   evaluation point minus (Panel.C0,Panel.C1,Panel.C2)
         
     | 
| 
      
 1898 
     | 
    
         
            +
             
     | 
| 
      
 1899 
     | 
    
         
            +
            OUTPUT PARAMETERS:
         
     | 
| 
      
 1900 
     | 
    
         
            +
                F               -   model value
         
     | 
| 
      
 1901 
     | 
    
         
            +
                InvPowRPPlus1   -   1/(R^(P+1))
         
     | 
| 
      
 1902 
     | 
    
         
            +
             
     | 
| 
      
 1903 
     | 
    
         
            +
              -- ALGLIB --
         
     | 
| 
      
 1904 
     | 
    
         
            +
     Copyright 26.08.2022 by Sergey Bochkanov
*************************************************************************/
ae_bool rbfv3farfields_bhpaneleval1fastkernel16_avx2(double d0,
     double d1,
     double d2,
     const double* pnma,
     const double* pnmb,
     const double* pmmcdiag,
     const double* ynma,
     const double* tblrmodmn,
     double* f,
     double* invpowrpplus1,
     ae_state *_state)
{
    ae_int_t n;
    double r, r2, r01, invr;
    double sintheta, costheta;
    ae_complex expiphi, expiphi2, expiphi3, expiphi4;
    ae_int_t jj;
    ae_bool result;

    *f = 0.0;
    *invpowrpplus1 = 0.0;
    result = ae_true;

    /*
     * Convert to spherical polar coordinates.
     *
     * NOTE: we make sure that R is non-zero by adding an extremely small perturbation
     */
    r2 = d0*d0+d1*d1+d2*d2+ae_minrealnumber;
    r = ae_sqrt(r2, _state);
    r01 = ae_sqrt(d0*d0+d1*d1+ae_minrealnumber, _state);
    costheta = d2/r;
    sintheta = r01/r;
    expiphi.x = d0/r01;
    expiphi.y = d1/r01;
    invr = (double)1/r;

    /*
     * Prepare precomputed quantities
     */
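    /*
     * expiphiN below holds e^{i*N*phi}: expiphi2 is the complex square of
     * expiphi, and expiphi3/expiphi4 are products of lower powers, so all
     * powers of the azimuthal phase are obtained without further trig calls.
     */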
         
    double powsintheta2 = sintheta*sintheta;
    double powsintheta3 = powsintheta2*sintheta;
    double powsintheta4 = powsintheta2*powsintheta2;
    expiphi2.x = expiphi.x*expiphi.x-expiphi.y*expiphi.y;
    expiphi2.y = 2*expiphi.x*expiphi.y;
    expiphi3.x = expiphi2.x*expiphi.x-expiphi2.y*expiphi.y;
    expiphi3.y = expiphi2.x*expiphi.y+expiphi.x*expiphi2.y;
    expiphi4.x = expiphi2.x*expiphi2.x-expiphi2.y*expiphi2.y;
    expiphi4.y = 2*expiphi2.x*expiphi2.y;

    /*
     * Compute far field expansion for a cluster of basis functions f=r
     *
     * NOTE: the original paper by Beatson et al. uses f=r as the basis function,
     *       whilst ALGLIB uses f=-r due to conditional positive definiteness requirement.
     *       We will perform conversion later.
     */
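    /*
     * Four orders m are processed per SIMD register: v_expijphix/v_expijphiy
     * hold e^{i*m*phi} for m=j0..j0+3, advanced by one complex multiply with
     * e^{i*4*phi} per block, and v_powsinthetaj holds the matching
     * sin(theta)^m seeds.
     */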
         
    __m256d v_costheta      = _mm256_set1_pd(costheta);
    __m256d v_r2            = _mm256_set1_pd(r2);
    __m256d v_f             = _mm256_setzero_pd();
    __m256d v_invr          = _mm256_set1_pd(invr);
    __m256d v_powsinthetaj  = _mm256_set_pd(powsintheta3, powsintheta2, sintheta, 1.0);
    __m256d v_powsintheta4  = _mm256_set1_pd(powsintheta4);
    __m256d v_expijphix     = _mm256_set_pd(expiphi3.x, expiphi2.x, expiphi.x, 1.0);
    __m256d v_expijphiy     = _mm256_set_pd(expiphi3.y, expiphi2.y, expiphi.y, 0.0);
    __m256d v_expi4phix     = _mm256_set1_pd(expiphi4.x);
    __m256d v_expi4phiy     = _mm256_set1_pd(expiphi4.y);
    *f = (double)(0);
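    /*
     * Outer loop: jj selects the block of orders m=4*jj..4*jj+3; the inner
     * loop runs the degree n from 4*jj to 15, with v_powrminusj1 tracking
     * 1/r^(n+1).
     */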
         
    for(jj=0; jj<4; jj++)
    {
        __m256d pnm_cur         = _mm256_setzero_pd(), pnm_prev = _mm256_setzero_pd(), pnm_new;
        __m256d v_powrminusj1   = _mm256_set1_pd(invr);
        for(n=0; n<jj*4; n++)
            v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
        for(n=jj*4; n<16; n++)
        {
            ae_int_t j0=jj*4;
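            /*
             * Associated Legendre recurrence over the degree n:
             * P(n,m) = sin^m(theta)*PMMCDiag(n,m) + PNMA(n,m)*cos(theta)*P(n-1,m) + PNMB(n,m)*P(n-2,m),
             * with the diagonal table seeding P(m,m) when n reaches m.
             */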
         
            pnm_new = _mm256_mul_pd(v_powsinthetaj, _mm256_load_pd(pmmcdiag+n*16+j0));
            pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(v_costheta,_mm256_mul_pd(pnm_cur,_mm256_load_pd(pnma+n*16+j0))));
            pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(pnm_prev,_mm256_load_pd(pnmb+n*16+j0)));
            pnm_prev = pnm_cur;
            pnm_cur  = pnm_new;

            __m256d v_tmp = _mm256_mul_pd(pnm_cur, _mm256_load_pd(ynma+n*16+j0));
            __m256d v_sphericalx = _mm256_mul_pd(v_tmp, v_expijphix);
            __m256d v_sphericaly = _mm256_mul_pd(v_tmp, v_expijphiy);
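            /*
             * TblRModMN appears to pack, for each (n, m-block), real/imag parts
             * of two coefficient vectors at offsets +0/+16 and +32/+48; they
             * are combined as (a + r^2*b), and v_z below is the real part of
             * the complex product of that sum with the spherical harmonic term.
             */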
         
            __m256d v_summnx = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(tblrmodmn+n*64+j0+32)),_mm256_load_pd(tblrmodmn+n*64+j0));
            __m256d v_summny = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(tblrmodmn+n*64+j0+48)),_mm256_load_pd(tblrmodmn+n*64+j0+16));

            __m256d v_z = _mm256_sub_pd(_mm256_mul_pd(v_sphericalx,v_summnx),_mm256_mul_pd(v_sphericaly,v_summny));

            v_f = _mm256_add_pd(v_f, _mm256_mul_pd(v_powrminusj1, v_z));
            v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
        }
        __m256d v_expijphix_new = _mm256_sub_pd(_mm256_mul_pd(v_expijphix,v_expi4phix),_mm256_mul_pd(v_expijphiy,v_expi4phiy));
        __m256d v_expijphiy_new = _mm256_add_pd(_mm256_mul_pd(v_expijphix,v_expi4phiy),_mm256_mul_pd(v_expijphiy,v_expi4phix));
        v_powsinthetaj = _mm256_mul_pd(v_powsinthetaj, v_powsintheta4);
        v_expijphix = v_expijphix_new;
        v_expijphiy = v_expijphiy_new;
    }
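    /*
     * Horizontal reduction: sum the four accumulator lanes of v_f into the
     * scalar output *f.
     */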
         
    double ttt[4];
    _mm256_storeu_pd(ttt, v_f);
    for(int k=0; k<4; k++)
        *f += ttt[k];
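    /*
     * P=15 for this kernel, so R^(P+1)=R^16 is built from r2 by three squarings.
     */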
         
    double r4 = r2*r2;
    double r8 = r4*r4;
    double r16 = r8*r8;
    *invpowrpplus1 = 1/r16;

    return result;
}
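
/*
 * A minimal scalar sketch (illustration only, not part of ALGLIB) of the
 * phase recurrence used by the kernel above: e^{i*m*phi} for m=0..15 is
 * generated in blocks of four, and each block is advanced by one complex
 * multiplication with e^{i*4*phi}, mirroring the four lanes of
 * v_expijphix/v_expijphiy. The function name is hypothetical.
 */
static void sketch_expimphi16(double cosphi, double sinphi, double *outx, double *outy)
{
    /* block of four consecutive powers e^{i*0*phi}..e^{i*3*phi} */
    double bx[4], by[4];
    bx[0] = 1.0;    by[0] = 0.0;
    bx[1] = cosphi; by[1] = sinphi;
    bx[2] = bx[1]*cosphi-by[1]*sinphi; by[2] = bx[1]*sinphi+by[1]*cosphi;
    bx[3] = bx[2]*cosphi-by[2]*sinphi; by[3] = bx[2]*sinphi+by[2]*cosphi;

    /* e^{i*4*phi}, the per-block stride */
    double sx = bx[3]*cosphi-by[3]*sinphi;
    double sy = bx[3]*sinphi+by[3]*cosphi;
    for(int jj=0; jj<4; jj++)
    {
        for(int j=0; j<4; j++)
        {
            outx[jj*4+j] = bx[j];
            outy[jj*4+j] = by[j];
        }
        for(int j=0; j<4; j++)
        {
            /* e^{i*(m+4)*phi} = e^{i*m*phi} * e^{i*4*phi} */
            double nx = bx[j]*sx-by[j]*sy;
            double ny = bx[j]*sy+by[j]*sx;
            bx[j] = nx;
            by[j] = ny;
        }
    }
}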
         

/*************************************************************************
Fast kernel for biharmonic panel with general NY

INPUT PARAMETERS:
    D0, D1, D2      -   evaluation point minus (Panel.C0,Panel.C1,Panel.C2)

OUTPUT PARAMETERS:
    F               -   array[NY], model value
    InvPowRPPlus1   -   1/(R^(P+1))

  -- ALGLIB --
     Copyright 26.08.2022 by Sergey Bochkanov
*************************************************************************/
ae_bool rbfv3farfields_bhpanelevalfastkernel16_avx2(double d0,
     double d1,
     double d2,
     ae_int_t ny,
     const double* pnma,
     const double* pnmb,
     const double* pmmcdiag,
     const double* ynma,
     const double* tblrmodmn,
     double* f,
     double* invpowrpplus1,
     ae_state *_state)
{
    ae_int_t n;
    double r, r2, r01, invr;
    double sintheta, costheta;
    ae_complex expiphi, expiphi2, expiphi3, expiphi4;
    ae_int_t jj;

    /*
     * Precomputed buffer which is enough for NY up to 16
     */
    __m256d v_f[16];
    if( ny>16 )
        return ae_false;
    for(int k=0; k<ny; k++)
    {
        v_f[k] = _mm256_setzero_pd();
        f[k] = 0.0;
    }

    /*
     * Convert to spherical polar coordinates.
     *
     * NOTE: we make sure that R is non-zero by adding an extremely small perturbation
     */
    r2 = d0*d0+d1*d1+d2*d2+ae_minrealnumber;
    r = ae_sqrt(r2, _state);
    r01 = ae_sqrt(d0*d0+d1*d1+ae_minrealnumber, _state);
    costheta = d2/r;
    sintheta = r01/r;
    expiphi.x = d0/r01;
    expiphi.y = d1/r01;
    invr = (double)1/r;

    /*
     * Prepare precomputed quantities
     */
    double powsintheta2 = sintheta*sintheta;
    double powsintheta3 = powsintheta2*sintheta;
    double powsintheta4 = powsintheta2*powsintheta2;
    expiphi2.x = expiphi.x*expiphi.x-expiphi.y*expiphi.y;
    expiphi2.y = 2*expiphi.x*expiphi.y;
    expiphi3.x = expiphi2.x*expiphi.x-expiphi2.y*expiphi.y;
    expiphi3.y = expiphi2.x*expiphi.y+expiphi.x*expiphi2.y;
    expiphi4.x = expiphi2.x*expiphi2.x-expiphi2.y*expiphi2.y;
    expiphi4.y = 2*expiphi2.x*expiphi2.y;

    /*
     * Compute far field expansion for a cluster of basis functions f=r
     *
     * NOTE: the original paper by Beatson et al. uses f=r as the basis function,
     *       whilst ALGLIB uses f=-r due to conditional positive definiteness requirement.
     *       We will perform conversion later.
     */
    __m256d v_costheta      = _mm256_set1_pd(costheta);
    __m256d v_r2            = _mm256_set1_pd(r2);
    __m256d v_invr          = _mm256_set1_pd(invr);
    __m256d v_powsinthetaj  = _mm256_set_pd(powsintheta3, powsintheta2, sintheta, 1.0);
    __m256d v_powsintheta4  = _mm256_set1_pd(powsintheta4);
    __m256d v_expijphix     = _mm256_set_pd(expiphi3.x, expiphi2.x, expiphi.x, 1.0);
    __m256d v_expijphiy     = _mm256_set_pd(expiphi3.y, expiphi2.y, expiphi.y, 0.0);
    __m256d v_expi4phix     = _mm256_set1_pd(expiphi4.x);
    __m256d v_expi4phiy     = _mm256_set1_pd(expiphi4.y);
    *f = (double)(0);
    for(jj=0; jj<4; jj++)
    {
        __m256d pnm_cur         = _mm256_setzero_pd(), pnm_prev = _mm256_setzero_pd(), pnm_new;
        __m256d v_powrminusj1   = _mm256_set1_pd(invr);
        for(n=0; n<jj*4; n++)
            v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
        for(n=jj*4; n<16; n++)
        {
            ae_int_t j0=jj*4;

            pnm_new = _mm256_mul_pd(v_powsinthetaj, _mm256_load_pd(pmmcdiag+n*16+j0));
            pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(v_costheta,_mm256_mul_pd(pnm_cur,_mm256_load_pd(pnma+n*16+j0))));
            pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(pnm_prev,_mm256_load_pd(pnmb+n*16+j0)));
            pnm_prev = pnm_cur;
            pnm_cur  = pnm_new;

            __m256d v_tmp = _mm256_mul_pd(pnm_cur, _mm256_load_pd(ynma+n*16+j0));
            __m256d v_sphericalx = _mm256_mul_pd(v_tmp, v_expijphix);
            __m256d v_sphericaly = _mm256_mul_pd(v_tmp, v_expijphiy);
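            /*
             * TblRModMN holds one 16x64 coefficient table (1024 doubles) per
             * output component, hence the k-loop below steps p_rmodmn by 1024.
             */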
         
            const double *p_rmodmn = tblrmodmn+n*64+j0;
            for(int k=0; k<ny; k++)
            {
                __m256d v_summnx = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(p_rmodmn+32)),_mm256_load_pd(p_rmodmn));
                __m256d v_summny = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(p_rmodmn+48)),_mm256_load_pd(p_rmodmn+16));
                __m256d v_z = _mm256_sub_pd(_mm256_mul_pd(v_sphericalx,v_summnx),_mm256_mul_pd(v_sphericaly,v_summny));
                v_f[k] = _mm256_add_pd(v_f[k], _mm256_mul_pd(v_powrminusj1, v_z));
                p_rmodmn += 1024;
            }
            v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
        }
        __m256d v_expijphix_new = _mm256_sub_pd(_mm256_mul_pd(v_expijphix,v_expi4phix),_mm256_mul_pd(v_expijphiy,v_expi4phiy));
        __m256d v_expijphiy_new = _mm256_add_pd(_mm256_mul_pd(v_expijphix,v_expi4phiy),_mm256_mul_pd(v_expijphiy,v_expi4phix));
        v_powsinthetaj = _mm256_mul_pd(v_powsinthetaj, v_powsintheta4);
        v_expijphix = v_expijphix_new;
        v_expijphiy = v_expijphiy_new;
    }

    for(int t=0; t<ny; t++)
    {
        double ttt[4];
        _mm256_storeu_pd(ttt, v_f[t]);
        for(int k=0; k<4; k++)
            f[t] += ttt[k];
    }

    double r4 = r2*r2;
    double r8 = r4*r4;
    double r16 = r8*r8;
    *invpowrpplus1 = 1/r16;

    return ae_true;
}
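
/*
 * A minimal scalar sketch (illustration only, not part of ALGLIB) of the
 * associated Legendre recurrence that the vectorized loops above evaluate:
 * P(n,m) = sin^m(theta)*CDiag(n,m) + A(n,m)*cos(theta)*P(n-1,m) + B(n,m)*P(n-2,m),
 * where the diagonal table seeds P(m,m). The function and its coefficient
 * tables (row stride rstride) are hypothetical stand-ins for the
 * PNMA/PNMB/PMMCDiag tables used by the kernels.
 */
static double sketch_pnm(ae_int_t n, ae_int_t m, double costheta, double sintheta,
    const double *a, const double *b, const double *cdiag, ae_int_t rstride)
{
    double pw = 1.0;
    for(ae_int_t i=0; i<m; i++)
        pw *= sintheta;                 /* sin(theta)^m, the diagonal seed */
    double prev = 0.0, cur = 0.0;
    for(ae_int_t k=m; k<=n; k++)
    {
        /* same update as the pnm_new statements above, one (k,m) pair at a time */
        double next = pw*cdiag[k*rstride+m]
                    + a[k*rstride+m]*costheta*cur
                    + b[k*rstride+m]*prev;
        prev = cur;
        cur  = next;
    }
    return cur;
}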
         

/* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_AVX2_INTRINSICS */
#endif


}
