faiss 0.2.0 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +20 -2
data/vendor/faiss/faiss/utils/random.cpp:

```diff
@@ -15,79 +15,67 @@ namespace faiss {
  * Random data generation functions
  **************************************************/
 
-RandomGenerator::RandomGenerator (int64_t seed)
-    : mt((unsigned int)seed) {}
+RandomGenerator::RandomGenerator(int64_t seed) : mt((unsigned int)seed) {}
 
-int RandomGenerator::rand_int ()
-{
+int RandomGenerator::rand_int() {
     return mt() & 0x7fffffff;
 }
 
-int64_t RandomGenerator::rand_int64 ()
-{
+int64_t RandomGenerator::rand_int64() {
     return int64_t(rand_int()) | int64_t(rand_int()) << 31;
 }
 
-int RandomGenerator::rand_int (int max)
-{
+int RandomGenerator::rand_int(int max) {
     return mt() % max;
 }
 
-float RandomGenerator::rand_float ()
-{
+float RandomGenerator::rand_float() {
     return mt() / float(mt.max());
 }
 
-double RandomGenerator::rand_double ()
-{
+double RandomGenerator::rand_double() {
     return mt() / double(mt.max());
 }
 
-
 /***********************************************************************
  * Random functions in this C file only exist because Torch
  * counterparts are slow and not multi-threaded. Typical use is for
  * more than 1-100 billion values. */
 
-
 /* Generate a set of random floating point values such that x[i] in [0,1]
    multi-threading. For this reason, we rely on re-entreant functions. */
-void float_rand (float * x, size_t n, int64_t seed)
-{
+void float_rand(float* x, size_t n, int64_t seed) {
     // only try to parallelize on large enough arrays
     const size_t nblock = n < 1024 ? 1 : 1024;
 
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
+    RandomGenerator rng0(seed);
+    int a0 = rng0.rand_int(), b0 = rng0.rand_int();
 
 #pragma omp parallel for
     for (int64_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
+        RandomGenerator rng(a0 + j * b0);
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
 
         for (size_t i = istart; i < iend; i++)
-            x[i] = rng.rand_float ();
+            x[i] = rng.rand_float();
     }
 }
 
-
-void float_randn (float * x, size_t n, int64_t seed)
-{
+void float_randn(float* x, size_t n, int64_t seed) {
     // only try to parallelize on large enough arrays
     const size_t nblock = n < 1024 ? 1 : 1024;
 
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
+    RandomGenerator rng0(seed);
+    int a0 = rng0.rand_int(), b0 = rng0.rand_int();
 
 #pragma omp parallel for
     for (int64_t j = 0; j < nblock; j++) {
-        RandomGenerator rng (a0 + j * b0);
+        RandomGenerator rng(a0 + j * b0);
 
         double a = 0, b = 0, s = 0;
-        int state = 0;
+        int state = 0; /* generate two number per "do-while" loop */
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
@@ -96,96 +84,84 @@ void float_randn (float * x, size_t n, int64_t seed)
             /* Marsaglia's method (see Knuth) */
             if (state == 0) {
                 do {
-                    a = 2.0 * rng.rand_double () - 1;
-                    b = 2.0 * rng.rand_double () - 1;
+                    a = 2.0 * rng.rand_double() - 1;
+                    b = 2.0 * rng.rand_double() - 1;
                     s = a * a + b * b;
                 } while (s >= 1.0);
                 x[i] = a * sqrt(-2.0 * log(s) / s);
-            }
-            else
+            } else
                 x[i] = b * sqrt(-2.0 * log(s) / s);
             state = 1 - state;
         }
     }
 }
 
-
 /* Integer versions */
-void int64_rand (int64_t * x, size_t n, int64_t seed)
-{
+void int64_rand(int64_t* x, size_t n, int64_t seed) {
     // only try to parallelize on large enough arrays
     const size_t nblock = n < 1024 ? 1 : 1024;
 
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
+    RandomGenerator rng0(seed);
+    int a0 = rng0.rand_int(), b0 = rng0.rand_int();
 
 #pragma omp parallel for
     for (int64_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
+        RandomGenerator rng(a0 + j * b0);
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
         for (size_t i = istart; i < iend; i++)
-            x[i] = rng.rand_int64 ();
+            x[i] = rng.rand_int64();
     }
 }
 
-void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed)
-{
+void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed) {
     // only try to parallelize on large enough arrays
     const size_t nblock = n < 1024 ? 1 : 1024;
 
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
+    RandomGenerator rng0(seed);
+    int a0 = rng0.rand_int(), b0 = rng0.rand_int();
 
 #pragma omp parallel for
     for (int64_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
+        RandomGenerator rng(a0 + j * b0);
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
         for (size_t i = istart; i < iend; i++)
-            x[i] = rng.rand_int64 () % max;
+            x[i] = rng.rand_int64() % max;
     }
 }
 
+void rand_perm(int* perm, size_t n, int64_t seed) {
+    for (size_t i = 0; i < n; i++)
+        perm[i] = i;
 
-void rand_perm (int *perm, size_t n, int64_t seed)
-{
-    for (size_t i = 0; i < n; i++) perm[i] = i;
-
-    RandomGenerator rng (seed);
+    RandomGenerator rng(seed);
 
     for (size_t i = 0; i + 1 < n; i++) {
-        int i2 = i + rng.rand_int (n - i);
+        int i2 = i + rng.rand_int(n - i);
         std::swap(perm[i], perm[i2]);
     }
 }
 
-
-
-
-void byte_rand (uint8_t * x, size_t n, int64_t seed)
-{
+void byte_rand(uint8_t* x, size_t n, int64_t seed) {
     // only try to parallelize on large enough arrays
     const size_t nblock = n < 1024 ? 1 : 1024;
 
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
+    RandomGenerator rng0(seed);
+    int a0 = rng0.rand_int(), b0 = rng0.rand_int();
 
 #pragma omp parallel for
     for (int64_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
+        RandomGenerator rng(a0 + j * b0);
 
         const size_t istart = j * n / nblock;
         const size_t iend = (j + 1) * n / nblock;
 
         size_t i;
         for (i = istart; i < iend; i++)
-            x[i] = rng.rand_int64 ();
+            x[i] = rng.rand_int64();
     }
 }
 
```
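The change above is a pure reformatting of these helpers, so their call signatures are a good reference point. Note the seeding scheme inside the loops: two draws from `rng0` parameterize one `RandomGenerator` per block, so the output is reproducible for a given seed regardless of the number of OpenMP threads. For orientation, here is a minimal usage sketch; the include path and `main` scaffolding are assumptions for illustration, not part of the gem:

```cpp
// Sketch only: assumes the vendored faiss headers are on the include path
// and the program is linked against faiss.
#include <faiss/utils/random.h>

#include <cstdio>
#include <vector>

int main() {
    const size_t n = 8;

    std::vector<float> x(n);
    faiss::float_rand(x.data(), n, /*seed=*/1234); // uniform values in [0, 1]

    std::vector<int> perm(n);
    faiss::rand_perm(perm.data(), n, /*seed=*/1234); // permutation of 0..n-1

    for (size_t i = 0; i < n; i++) {
        std::printf("x[%zu] = %g, perm[%zu] = %d\n", i, x[i], i, perm[i]);
    }
    return 0;
}
```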
data/vendor/faiss/faiss/utils/random.h:

```diff
@@ -13,9 +13,8 @@
 
 #pragma once
 
-#include <random>
 #include <stdint.h>
-
+#include <random>
 
 namespace faiss {
 
@@ -25,36 +24,34 @@ namespace faiss {
 
 /// random generator that can be used in multithreaded contexts
 struct RandomGenerator {
-
     std::mt19937 mt;
 
     /// random positive integer
-    int rand_int ();
+    int rand_int();
 
     /// random int64_t
-    int64_t rand_int64 ();
+    int64_t rand_int64();
 
     /// generate random integer between 0 and max-1
-    int rand_int (int max);
+    int rand_int(int max);
 
     /// between 0 and 1
-    float rand_float ();
+    float rand_float();
 
-    double rand_double ();
+    double rand_double();
 
-    explicit RandomGenerator (int64_t seed = 1234);
+    explicit RandomGenerator(int64_t seed = 1234);
 };
 
 /* Generate an array of uniform random floats / multi-threaded implementation */
-void float_rand (float * x, size_t n, int64_t seed);
-void float_randn (float * x, size_t n, int64_t seed);
-void int64_rand (int64_t * x, size_t n, int64_t seed);
-void byte_rand (uint8_t * x, size_t n, int64_t seed);
+void float_rand(float* x, size_t n, int64_t seed);
+void float_randn(float* x, size_t n, int64_t seed);
+void int64_rand(int64_t* x, size_t n, int64_t seed);
+void byte_rand(uint8_t* x, size_t n, int64_t seed);
 // max is actually the maximum value + 1
-void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed);
+void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed);
 
 /* random permutation */
-void rand_perm (int *perm, size_t n, int64_t seed);
-
+void rand_perm(int* perm, size_t n, int64_t seed);
 
 } // namespace faiss
```
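The header also pins down the determinism contract the implementations rely on: a `RandomGenerator` wraps `std::mt19937`, so two generators built with the same seed (default 1234) produce identical streams. A quick hypothetical self-check, not part of the gem's sources or tests:

```cpp
// Hypothetical check of the RandomGenerator contract declared above.
#include <faiss/utils/random.h>

#include <cassert>

int main() {
    faiss::RandomGenerator g1(42);
    faiss::RandomGenerator g2(42);
    for (int k = 0; k < 100; k++) {
        // rand_int(max) returns a value in [0, max-1]
        assert(g1.rand_int(1000) == g2.rand_int(1000));
    }

    faiss::RandomGenerator g3;  // seed defaults to 1234
    float f = g3.rand_float();  // in [0, 1]
    assert(f >= 0.0f && f <= 1.0f);
    return 0;
}
```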
data/vendor/faiss/faiss/utils/simdlib.h:

```diff
@@ -7,8 +7,6 @@
 
 #pragma once
 
-
-
 /** Abstractions for 256-bit registers
  *
  * The objective is to separate the different interpretations of the same
@@ -20,6 +18,10 @@
 
 #include <faiss/utils/simdlib_avx2.h>
 
+#elif defined(__aarch64__)
+
+#include <faiss/utils/simdlib_neon.h>
+
 #else
 
 // emulated = all operations are implemented as scalars
```
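With the new `#elif defined(__aarch64__)` branch (backed by the new `simdlib_neon.h`, +832 lines in the file list), the SIMD backend (AVX2, NEON, or the scalar emulation) is chosen purely at preprocessing time; callers keep including the umbrella header and use the same wrapper types. A sketch of what a consumer looks like, assuming a faiss build targeting any of the three backends:

```cpp
// Sketch: backend-agnostic use of the simdlib wrappers. Which header this
// pulls in (AVX2, NEON, or emulated) depends on the compilation target.
#include <faiss/utils/simdlib.h>

float sum8(const float* p) {
    faiss::simd8float32 v;
    v.loadu(p); // unaligned load, safe for arbitrary float pointers

    float tab[8];
    v.storeu(tab);

    float s = 0;
    for (int i = 0; i < 8; i++) {
        s += tab[i];
    }
    return s;
}
```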
data/vendor/faiss/faiss/utils/simdlib_avx2.h:

```diff
@@ -7,8 +7,8 @@
 
 #pragma once
 
-#include <string>
 #include <cstdint>
+#include <string>
 
 #include <immintrin.h>
 
@@ -16,7 +16,6 @@
 
 namespace faiss {
 
-
 /** Simple wrapper around the AVX 256-bit registers
  *
  * The objective is to separate the different interpretations of the same
@@ -27,36 +26,34 @@ namespace faiss {
 
 /// 256-bit representation without interpretation as a vector
 struct simd256bit {
-
-    union {
+    union {
         __m256i i;
         __m256 f;
     };
 
-    simd256bit()  {}
+    simd256bit() {}
 
-    explicit simd256bit(__m256i i): i(i) {}
+    explicit simd256bit(__m256i i) : i(i) {}
 
-    explicit simd256bit(__m256 f): f(f) {}
+    explicit simd256bit(__m256 f) : f(f) {}
 
-    explicit simd256bit(const void *x):
-        i(_mm256_load_si256((__m256i const *)x))
-    {}
+    explicit simd256bit(const void* x)
+            : i(_mm256_load_si256((__m256i const*)x)) {}
 
     void clear() {
         i = _mm256_setzero_si256();
     }
 
-    void storeu(void *ptr) const {
-        _mm256_storeu_si256((__m256i *)ptr, i);
+    void storeu(void* ptr) const {
+        _mm256_storeu_si256((__m256i*)ptr, i);
     }
 
-    void loadu(const void *ptr) {
+    void loadu(const void* ptr) {
         i = _mm256_loadu_si256((__m256i*)ptr);
     }
 
-    void store(void *ptr) const {
-        _mm256_store_si256((__m256i *)ptr, i);
+    void store(void* ptr) const {
+        _mm256_store_si256((__m256i*)ptr, i);
     }
 
     void bin(char bits[257]) const {
@@ -73,30 +70,28 @@ struct simd256bit {
         bin(bits);
         return std::string(bits);
     }
-
 };
 
-
 /// vector of 16 elements in uint16
-struct simd16uint16: simd256bit {
+struct simd16uint16 : simd256bit {
     simd16uint16() {}
 
-    explicit simd16uint16(__m256i i): simd256bit(i) {}
+    explicit simd16uint16(__m256i i) : simd256bit(i) {}
 
-    explicit simd16uint16(int x): simd256bit(_mm256_set1_epi16(x)) {}
+    explicit simd16uint16(int x) : simd256bit(_mm256_set1_epi16(x)) {}
 
-    explicit simd16uint16(uint16_t x): simd256bit(_mm256_set1_epi16(x)) {}
+    explicit simd16uint16(uint16_t x) : simd256bit(_mm256_set1_epi16(x)) {}
 
-    explicit simd16uint16(simd256bit x): simd256bit(x) {}
+    explicit simd16uint16(simd256bit x) : simd256bit(x) {}
 
-    explicit simd16uint16(const uint16_t *x): simd256bit((const void*)x) {}
+    explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {}
 
-    std::string elements_to_string(const char *fmt) const {
+    std::string elements_to_string(const char* fmt) const {
         uint16_t bytes[16];
         storeu((void*)bytes);
         char res[1000];
-        char *ptr = res;
-        for(int i = 0; i < 16; i++) {
+        char* ptr = res;
+        for (int i = 0; i < 16; i++) {
             ptr += sprintf(ptr, fmt, bytes[i]);
         }
         // strip last ,
@@ -117,47 +112,47 @@ struct simd16uint16: simd256bit {
     }
 
     // shift must be known at compile time
-    simd16uint16 operator >> (const int shift) const {
+    simd16uint16 operator>>(const int shift) const {
         return simd16uint16(_mm256_srli_epi16(i, shift));
     }
 
     // shift must be known at compile time
-    simd16uint16 operator << (const int shift) const {
+    simd16uint16 operator<<(const int shift) const {
         return simd16uint16(_mm256_slli_epi16(i, shift));
     }
 
-    simd16uint16 operator += (simd16uint16 other) {
+    simd16uint16 operator+=(simd16uint16 other) {
         i = _mm256_add_epi16(i, other.i);
         return *this;
     }
 
-    simd16uint16 operator -= (simd16uint16 other) {
+    simd16uint16 operator-=(simd16uint16 other) {
        i = _mm256_sub_epi16(i, other.i);
        return *this;
    }
 
-    simd16uint16 operator + (simd16uint16 other) const {
+    simd16uint16 operator+(simd16uint16 other) const {
         return simd16uint16(_mm256_add_epi16(i, other.i));
     }
 
-    simd16uint16 operator - (simd16uint16 other) const {
+    simd16uint16 operator-(simd16uint16 other) const {
         return simd16uint16(_mm256_sub_epi16(i, other.i));
     }
 
-    simd16uint16 operator & (simd256bit other) const {
+    simd16uint16 operator&(simd256bit other) const {
         return simd16uint16(_mm256_and_si256(i, other.i));
     }
 
-    simd16uint16 operator | (simd256bit other) const {
+    simd16uint16 operator|(simd256bit other) const {
         return simd16uint16(_mm256_or_si256(i, other.i));
     }
 
     // returns binary masks
-    simd16uint16 operator == (simd256bit other) const {
+    simd16uint16 operator==(simd256bit other) const {
         return simd16uint16(_mm256_cmpeq_epi16(i, other.i));
     }
 
-    simd16uint16 operator ~ () const {
+    simd16uint16 operator~() const {
         return simd16uint16(_mm256_xor_si256(i, _mm256_set1_epi32(-1)));
     }
 
@@ -188,7 +183,7 @@ struct simd16uint16: simd256bit {
     }
 
     // for debugging only
-    uint16_t operator [] (int i) const {
+    uint16_t operator[](int i) const {
         ALIGNED(32) uint16_t tab[16];
         store(tab);
         return tab[i];
@@ -201,7 +196,6 @@ struct simd16uint16: simd256bit {
     void accu_max(simd16uint16 incoming) {
         i = _mm256_max_epu16(i, incoming.i);
     }
-
 };
 
 // not really a std::min because it returns an elementwise min
@@ -213,13 +207,10 @@ inline simd16uint16 max(simd16uint16 a, simd16uint16 b) {
     return simd16uint16(_mm256_max_epu16(a.i, b.i));
 }
 
-
-
 // decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
 // return (a0 + a1, b0 + b1)
 // TODO find a better name
 inline simd16uint16 combine2x2(simd16uint16 a, simd16uint16 b) {
-
     __m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21);
     __m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0);
 
@@ -229,7 +220,6 @@ inline simd16uint16 combine2x2(simd16uint16 a, simd16uint16 b) {
 // compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
 // of d0 and d1 with thr
 inline uint32_t cmp_ge32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
-
     __m256i max0 = _mm256_max_epu16(d0.i, thr.i);
     __m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
 
@@ -245,9 +235,7 @@ inline uint32_t cmp_ge32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
     return ge;
 }
 
-
 inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
-
     __m256i max0 = _mm256_min_epu16(d0.i, thr.i);
     __m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
 
@@ -263,29 +251,26 @@ inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
     return ge;
 }
 
-
 // vector of 32 unsigned 8-bit integers
-struct simd32uint8: simd256bit {
-
-
+struct simd32uint8 : simd256bit {
     simd32uint8() {}
 
-    explicit simd32uint8(__m256i i): simd256bit(i) {}
+    explicit simd32uint8(__m256i i) : simd256bit(i) {}
 
-    explicit simd32uint8(int x): simd256bit(_mm256_set1_epi8(x)) {}
+    explicit simd32uint8(int x) : simd256bit(_mm256_set1_epi8(x)) {}
 
-    explicit simd32uint8(uint8_t x): simd256bit(_mm256_set1_epi8(x)) {}
+    explicit simd32uint8(uint8_t x) : simd256bit(_mm256_set1_epi8(x)) {}
 
-    explicit simd32uint8(simd256bit x): simd256bit(x) {}
+    explicit simd32uint8(simd256bit x) : simd256bit(x) {}
 
-    explicit simd32uint8(const uint8_t *x): simd256bit((const void*)x) {}
+    explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {}
 
-    std::string elements_to_string(const char *fmt) const {
+    std::string elements_to_string(const char* fmt) const {
         uint8_t bytes[32];
         storeu((void*)bytes);
         char res[1000];
-        char *ptr = res;
-        for(int i = 0; i < 32; i++) {
+        char* ptr = res;
+        for (int i = 0; i < 32; i++) {
             ptr += sprintf(ptr, fmt, bytes[i]);
         }
         // strip last ,
@@ -305,11 +290,11 @@ struct simd32uint8: simd256bit {
         i = _mm256_set1_epi8((char)x);
     }
 
-    simd32uint8 operator & (simd256bit other) const {
+    simd32uint8 operator&(simd256bit other) const {
         return simd32uint8(_mm256_and_si256(i, other.i));
     }
 
-    simd32uint8 operator + (simd32uint8 other) const {
+    simd32uint8 operator+(simd32uint8 other) const {
         return simd32uint8(_mm256_add_epi8(i, other.i));
     }
 
@@ -329,18 +314,17 @@ struct simd32uint8: simd256bit {
         return simd16uint16(_mm256_cvtepu8_epi16(x));
     }
 
-    simd32uint8 operator += (simd32uint8 other) {
+    simd32uint8 operator+=(simd32uint8 other) {
         i = _mm256_add_epi8(i, other.i);
         return *this;
     }
 
     // for debugging only
-    uint8_t operator [] (int i) const {
+    uint8_t operator[](int i) const {
         ALIGNED(32) uint8_t tab[32];
         store(tab);
         return tab[i];
     }
-
 };
 
 // convert with saturation
@@ -359,26 +343,24 @@ inline simd32uint8 blendv(simd32uint8 a, simd32uint8 b, simd32uint8 mask) {
     return simd32uint8(_mm256_blendv_epi8(a.i, b.i, mask.i));
 }
 
-
-
 /// vector of 8 unsigned 32-bit integers
-struct simd8uint32: simd256bit {
+struct simd8uint32 : simd256bit {
     simd8uint32() {}
 
-    explicit simd8uint32(__m256i i): simd256bit(i) {}
+    explicit simd8uint32(__m256i i) : simd256bit(i) {}
 
-    explicit simd8uint32(uint32_t x): simd256bit(_mm256_set1_epi32(x)) {}
+    explicit simd8uint32(uint32_t x) : simd256bit(_mm256_set1_epi32(x)) {}
 
-    explicit simd8uint32(simd256bit x): simd256bit(x) {}
+    explicit simd8uint32(simd256bit x) : simd256bit(x) {}
 
-    explicit simd8uint32(const uint8_t *x): simd256bit((const void*)x) {}
+    explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {}
 
-    std::string elements_to_string(const char *fmt) const {
+    std::string elements_to_string(const char* fmt) const {
         uint32_t bytes[8];
         storeu((void*)bytes);
         char res[1000];
-        char *ptr = res;
-        for(int i = 0; i < 8; i++) {
+        char* ptr = res;
+        for (int i = 0; i < 8; i++) {
             ptr += sprintf(ptr, fmt, bytes[i]);
         }
         // strip last ,
@@ -397,31 +379,28 @@ struct simd8uint32: simd256bit {
     void set1(uint32_t x) {
         i = _mm256_set1_epi32((int)x);
     }
-
 };
 
-struct simd8float32: simd256bit {
-
+struct simd8float32 : simd256bit {
     simd8float32() {}
 
+    explicit simd8float32(simd256bit x) : simd256bit(x) {}
 
-    explicit simd8float32(simd256bit x): simd256bit(x) {}
-
-    explicit simd8float32(__m256 x): simd256bit(x) {}
+    explicit simd8float32(__m256 x) : simd256bit(x) {}
 
-    explicit simd8float32(float x): simd256bit(_mm256_set1_ps(x)) {}
+    explicit simd8float32(float x) : simd256bit(_mm256_set1_ps(x)) {}
 
-    explicit simd8float32(const float *x): simd256bit(_mm256_load_ps(x)) {}
+    explicit simd8float32(const float* x) : simd256bit(_mm256_load_ps(x)) {}
 
-    simd8float32 operator * (simd8float32 other) const {
+    simd8float32 operator*(simd8float32 other) const {
         return simd8float32(_mm256_mul_ps(f, other.f));
     }
 
-    simd8float32 operator + (simd8float32 other) const {
+    simd8float32 operator+(simd8float32 other) const {
         return simd8float32(_mm256_add_ps(f, other.f));
     }
 
-    simd8float32 operator - (simd8float32 other) const {
+    simd8float32 operator-(simd8float32 other) const {
         return simd8float32(_mm256_sub_ps(f, other.f));
     }
 
@@ -429,15 +408,14 @@ struct simd8float32: simd256bit {
         float tab[8];
         storeu((void*)tab);
         char res[1000];
-        char *ptr = res;
-        for(int i = 0; i < 8; i++) {
+        char* ptr = res;
+        for (int i = 0; i < 8; i++) {
             ptr += sprintf(ptr, "%g,", tab[i]);
         }
         // strip last ,
         ptr[-1] = 0;
         return std::string(res);
     }
-
 };
 
 inline simd8float32 hadd(simd8float32 a, simd8float32 b) {
@@ -457,5 +435,30 @@ inline simd8float32 fmadd(simd8float32 a, simd8float32 b, simd8float32 c) {
     return simd8float32(_mm256_fmadd_ps(a.f, b.f, c.f));
 }
 
+namespace {
+
+// get even float32's of a and b, interleaved
+inline simd8float32 geteven(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+            _mm256_shuffle_ps(a.f, b.f, 0 << 0 | 2 << 2 | 0 << 4 | 2 << 6));
+}
+
+// get odd float32's of a and b, interleaved
+inline simd8float32 getodd(simd8float32 a, simd8float32 b) {
+    return simd8float32(
+            _mm256_shuffle_ps(a.f, b.f, 1 << 0 | 3 << 2 | 1 << 4 | 3 << 6));
+}
+
+// 3 cycles
+// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
+inline simd8float32 getlow128(simd8float32 a, simd8float32 b) {
+    return simd8float32(_mm256_permute2f128_ps(a.f, b.f, 0 | 2 << 4));
+}
+
+inline simd8float32 gethigh128(simd8float32 a, simd8float32 b) {
+    return simd8float32(_mm256_permute2f128_ps(a.f, b.f, 1 | 3 << 4));
+}
+
+} // namespace
 
 } // namespace faiss
```
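As a usage note for the wrappers above, a fused multiply-add over 8 lanes can be written directly with `simd8float32` and the `fmadd` helper shown in this header. A sketch (AVX2 build assumed; `loadu`/`storeu` sidestep the 32-byte alignment required by the pointer constructor, and `madd8` is a hypothetical name, not a faiss function):

```cpp
// Sketch: y[i] = a[i] * b[i] + c[i] for 8 floats at a time.
#include <faiss/utils/simdlib.h>

void madd8(const float* a, const float* b, const float* c, float* y) {
    faiss::simd8float32 va, vb, vc;
    va.loadu(a);
    vb.loadu(b);
    vc.loadu(c);

    // On the AVX2 backend this lowers to a single _mm256_fmadd_ps.
    faiss::simd8float32 vy = faiss::fmadd(va, vb, vc);
    vy.storeu(y);
}
```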